Library
library(adegenet)
library(ade4)
library(car)
library(ComplexHeatmap)
library(data.table)
library(dartR)
library(ecodist)
library(hierfstat)
library(gplots)
library(LEA)
library(lfmm)
library(maps)
library(mapplots)
library(MASS)
library(pegas)
library(poppr)
library(qvalue)
library(randomcoloR)
library(raster)
library(reshape)
library(rworldmap)
library(scales)
library(SeqArray)
library(SeqVarTools)
library(shape)
library(SNPRelate)
library(stringr)
library(vcfR)
library(vegan)
library(xtable)
library(PBSmapping)
Session information
R.version
_
platform x86_64-apple-darwin17.0
arch x86_64
os darwin17.0
system x86_64, darwin17.0
status
major 4
minor 2.2
year 2022
month 10
day 31
svn rev 83211
language R
version.string R version 4.2.2 (2022-10-31)
nickname Innocent and Trusting
sessionInfo()
R version 4.2.2 (2022-10-31)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Catalina 10.15.7
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] grid stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] PBSmapping_2.73.2 xtable_1.8-4 vegan_2.6-4
[4] lattice_0.21-8 permute_0.9-7 vcfR_1.14.0
[7] stringr_1.5.0 SNPRelate_1.32.2 shape_1.4.6
[10] SeqVarTools_1.36.0 SeqArray_1.38.0 gdsfmt_1.34.1
[13] scales_1.2.1 rworldmap_1.3-6 reshape_0.8.9
[16] raster_3.6-20 sp_1.6-0 randomcoloR_1.1.0.1
[19] qvalue_2.30.0 poppr_2.9.4 pegas_1.2
[22] ape_5.7-1 MASS_7.3-59 mapplots_1.5.1
[25] maps_3.4.1 lfmm_1.1 LEA_3.10.2
[28] gplots_3.1.3 hierfstat_0.5-11 ecodist_2.0.9
[31] dartR_2.7.2 dartR.data_1.0.2 dplyr_1.1.2
[34] ggplot2_3.4.2 data.table_1.14.8 ComplexHeatmap_2.14.0
[37] car_3.1-2 carData_3.0-5 adegenet_2.1.10
[40] ade4_1.7-22 rmarkdown_2.21
loaded via a namespace (and not attached):
[1] utf8_1.2.3 R.utils_2.12.2 tidyselect_1.2.0
[4] combinat_0.0-8 Rtsne_0.16 maptools_1.1-6
[7] StAMPP_1.6.3 GWASExactHW_1.01 munsell_0.5.0
[10] codetools_0.2-19 withr_2.5.0 colorspace_2.1-0
[13] Biobase_2.58.0 knitr_1.42 stats4_4.2.2
[16] RgoogleMaps_1.4.5.3 logistf_1.24.1 GenomeInfoDbData_1.2.9
[19] gap.datasets_0.0.5 vctrs_0.6.2 generics_0.1.3
[22] xfun_0.39 R6_2.5.1 doParallel_1.0.17
[25] GenomeInfoDb_1.34.9 clue_0.3-64 fields_14.1
[28] bitops_1.0-7 cachem_1.0.8 promises_1.2.0.1
[31] pinfsc50_1.2.0 gtable_0.3.3 formula.tools_1.7.1
[34] spam_2.9-1 rlang_1.1.1 calibrate_1.7.7
[37] GlobalOptions_0.1.2 splines_4.2.2 rgdal_1.6-6
[40] broom_1.0.4 yaml_2.3.7 reshape2_1.4.4
[43] abind_1.4-5 backports_1.4.1 httpuv_1.6.9
[46] tools_4.2.2 ellipsis_0.3.2 jquerylib_0.1.4
[49] RColorBrewer_1.1-3 BiocGenerics_0.44.0 Rcpp_1.0.10
[52] plyr_1.8.8 zlibbioc_1.44.0 purrr_1.0.1
[55] RCurl_1.98-1.12 GetoptLong_1.0.5 viridis_0.6.3
[58] S4Vectors_0.36.2 cluster_2.1.4 magrittr_2.0.3
[61] genetics_1.3.8.1.3 circlize_0.4.15 mvtnorm_1.1-3
[64] matrixStats_0.63.0 patchwork_1.1.2 mime_0.12
[67] evaluate_0.20 IRanges_2.32.0 gridExtra_2.3
[70] compiler_4.2.2 tibble_3.2.1 mice_3.15.0
[73] KernSmooth_2.23-20 V8_4.3.0 crayon_1.5.2
[76] gdistance_1.6.2 R.oo_1.25.0 htmltools_0.5.5
[79] mgcv_1.8-42 later_1.3.0 tidyr_1.3.0
[82] DBI_1.1.3 PopGenReport_3.0.7 boot_1.3-28.1
[85] Matrix_1.5-4 cli_3.6.1 R.methodsS3_1.8.2
[88] gdata_2.18.0.1 parallel_4.2.2 dotCall64_1.0-2
[91] igraph_1.4.2 GenomicRanges_1.50.2 pkgconfig_2.0.3
[94] foreign_0.8-84 terra_1.7-29 foreach_1.5.2
[97] bslib_0.4.2 XVector_0.38.0 digest_0.6.31
[100] polysat_1.7-7 Biostrings_2.66.0 operator.tools_1.6.3
[103] curl_5.0.0 gap_1.5-1 shiny_1.7.4
[106] gtools_3.9.4 rjson_0.2.21 lifecycle_1.0.3
[109] nlme_3.1-162 dismo_1.3-9 jsonlite_1.8.4
[112] seqinr_4.2-30 viridisLite_0.4.2 fansi_1.0.4
[115] pillar_1.9.0 GGally_2.1.2 fastmap_1.1.1
[118] glue_1.6.2 mmod_1.3.3 png_0.1-8
[121] iterators_1.0.14 stringi_1.7.12 sass_0.4.5
[124] caTools_1.18.2
Quality control and SNP isolation
## load meta-data file
samps <- fread("samps_10Nov2020.csv")
## open GDS for common SNPs (PoolSNP)
genofile <- seqOpen("dest.all.PoolSNP.001.50.10Nov2020.ann.gds", allow.duplicate = TRUE)
## common SNP.dt
snp.dt <- data.table(chr = seqGetData(genofile, "chromosome"),
pos = seqGetData(genofile, "position"),
nAlleles = seqGetData(genofile, "$num_allele"),
id = seqGetData(genofile, "variant.id"),
genotype = seqGetData(genofile, "allele"))
snp.dt <- snp.dt[nAlleles == 2]
seqSetFilter(genofile, snp.dt$id)
## # of selected variants: 4,281,164
## filter to target
snp.tmp <- data.table(chr ="2L", pos = 3622074:3656953)
setkey(snp.tmp, chr, pos)
setkey(snp.dt, chr, pos)
seqSetFilter(genofile, variant.id=snp.dt[J(snp.tmp), nomatch = 0]$id)
## # of selected variants: 1,613
## get annotations
message("Annotations")
tmp <- seqGetData(genofile, "annotation/info/ANN")
len1 <- tmp$length
len2 <- tmp$data
snp.dt1 <- data.table(len = rep(len1, times = len1), ann = len2, id = rep(snp.dt[J(snp.tmp), nomatch = 0]$id, times = len1))
## Extract data between the 2nd and third | symbol
snp.dt1[ ,class := tstrsplit(snp.dt1$ann,"\\|")[[2]]]
snp.dt1[ ,gene := tstrsplit(snp.dt1$ann,"\\|")[[4]]]
## Collapse additional annotations to original SNP vector length
snp.dt1.an <- snp.dt1[,list(n = length(class), col = paste(class, collapse = ","), gene = paste(gene, collapse = ",")), list(variant.id = id)]
snp.dt1.an[,col := tstrsplit(snp.dt1.an$col,"\\,")[[1]]]
snp.dt1.an[,gene := tstrsplit(snp.dt1.an$gene,"\\,")[[1]]]
## get frequencies
message("Allele Freqs")
ad <- seqGetData(genofile, "annotation/format/AD")
dp <- seqGetData(genofile, "annotation/format/DP")
af <- data.table(ad = expand.grid(ad$data)[,1],
dp = expand.grid(dp)[,1],
sampleId = rep(seqGetData(genofile, "sample.id"), dim(ad$data)[2]),
variant.id = rep(seqGetData(genofile, "variant.id"), each = dim(ad$data)[1]))
## merge them together
message("merge")
afi <- merge(af, snp.dt1.an, by = "variant.id")
afi <- merge(afi, snp.dt, by.x = "variant.id", by.y = "id")
afi[ , af := ad/dp]
## calculate effective read-depth
afis <- merge(afi, samps, by = "sampleId")
afis[chr == "X", nEff := round((dp*nFlies - 1)/(dp+nFlies))]
afis[chr != "X", nEff := round((dp*2*nFlies - 1)/(dp+2*nFlies))]
afis[ ,af_nEff := round(af*nEff)/nEff]
## subsetting dataset
names(afis)
## [1] "sampleId" "variant.id" "ad" "dp"
## [5] "n" "col" "gene" "chr"
## [9] "pos" "nAlleles" "genotype" "af"
## [13] "locality" "country" "city" "collectionDate"
## [17] "DateExact" "lat" "long" "season"
## [21] "type" "continent" "set" "nFlies"
## [25] "SRA_accession" "SRA_experiment" "Model" "collector"
## [29] "sampleType" "year" "yday" "stationId"
## [33] "dist_km" "In(2L)t" "In(2R)Ns" "In(3L)P"
## [37] "In(3R)C" "In(3R)K" "In(3R)Mo" "In(3R)Payne"
## [41] "status" "bio1" "bio2" "bio3"
## [45] "bio4" "bio5" "bio6" "bio7"
## [49] "bio8" "bio9" "bio10" "bio11"
## [53] "bio12" "bio13" "bio14" "bio15"
## [57] "bio16" "bio17" "bio18" "bio19"
## [61] "tmin1" "tmin2" "tmin3" "tmin4"
## [65] "tmin5" "tmin6" "tmin7" "tmin8"
## [69] "tmin9" "tmin10" "tmin11" "tmin12"
## [73] "tmax1" "tmax2" "tmax3" "tmax4"
## [77] "tmax5" "tmax6" "tmax7" "tmax8"
## [81] "tmax9" "tmax10" "tmax11" "tmax12"
## [85] "prec1" "prec2" "prec3" "prec4"
## [89] "prec5" "prec6" "prec7" "prec8"
## [93] "prec9" "prec10" "prec11" "prec12"
## [97] "propSimNorm" "sex" "nEff" "af_nEff"
season.dat <- as.data.frame(afis[ , c(21, 30, 1, 13, 14, 15, 19, 18, 20, 2, 11, 12, 6, 22, 99)])
str(season.dat)
## 'data.frame': 438736 obs. of 15 variables:
## $ type : chr "pooled" "pooled" "pooled" "pooled" ...
## $ year : int 2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
## $ sampleId : chr "AT_Mau_14_01" "AT_Mau_14_01" "AT_Mau_14_01" "AT_Mau_14_01" ...
## $ locality : chr "AT_Mau" "AT_Mau" "AT_Mau" "AT_Mau" ...
## $ country : chr "Austria" "Austria" "Austria" "Austria" ...
## $ city : chr "Mauternbach" "Mauternbach" "Mauternbach" "Mauternbach" ...
## $ long : num 15.6 15.6 15.6 15.6 15.6 ...
## $ lat : num 48.4 48.4 48.4 48.4 48.4 ...
## $ season : chr "spring" "spring" "spring" "spring" ...
## $ variant.id: int 162333 162334 162335 162336 162337 162338 162339 162340 162341 162342 ...
## $ genotype : chr "A,C" "A,C" "C,T" "T,C" ...
## $ af : num 1 0 0.1569 0.0196 NA ...
## $ col : chr "3_prime_UTR_variant" "3_prime_UTR_variant" "3_prime_UTR_variant" "3_prime_UTR_variant" ...
## $ continent : chr "Europe" "Europe" "Europe" "Europe" ...
## $ nEff : num 28 26 31 31 NA 36 36 33 24 30 ...
head(season.dat)
## type year sampleId locality country city long lat season
## 1 pooled 2014 AT_Mau_14_01 AT_Mau Austria Mauternbach 15.56 48.375 spring
## 2 pooled 2014 AT_Mau_14_01 AT_Mau Austria Mauternbach 15.56 48.375 spring
## 3 pooled 2014 AT_Mau_14_01 AT_Mau Austria Mauternbach 15.56 48.375 spring
## 4 pooled 2014 AT_Mau_14_01 AT_Mau Austria Mauternbach 15.56 48.375 spring
## 5 pooled 2014 AT_Mau_14_01 AT_Mau Austria Mauternbach 15.56 48.375 spring
## 6 pooled 2014 AT_Mau_14_01 AT_Mau Austria Mauternbach 15.56 48.375 spring
## variant.id genotype af col continent nEff
## 1 162333 A,C 1.00000000 3_prime_UTR_variant Europe 28
## 2 162334 A,C 0.00000000 3_prime_UTR_variant Europe 26
## 3 162335 C,T 0.15686275 3_prime_UTR_variant Europe 31
## 4 162336 T,C 0.01960784 3_prime_UTR_variant Europe 31
## 5 162337 C,T NA 3_prime_UTR_variant Europe NA
## 6 162338 C,T 0.00000000 3_prime_UTR_variant Europe 36
dim(season.dat)
## [1] 438736 15
season.dat$season[season.dat$season == "frost"] <- "fall"
season.dat$season <- as.factor(season.dat$season)
season.filter.dat <- data.frame()
localities <- levels(as.factor(season.dat$locality))
seasons <- c("fall", "spring")
## getting samples collected at least once during spring and winter
for(loc in localities){
dft <- season.dat[season.dat$locality == loc, ]
if(all(seasons %in% unique(dft$season))){
season.filter.dat <- rbind(season.filter.dat, dft)
}
}
dim(season.filter.dat)
## [1] 285501 15
dim(na.omit(season.filter.dat))
## [1] 269454 15
str(season.filter.dat)
## 'data.frame': 285501 obs. of 15 variables:
## $ type : chr "pooled" "pooled" "pooled" "pooled" ...
## $ year : int 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
## $ sampleId : chr "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" ...
## $ locality : chr "AT_gr" "AT_gr" "AT_gr" "AT_gr" ...
## $ country : chr "Austria" "Austria" "Austria" "Austria" ...
## $ city : chr "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" ...
## $ long : num 16.4 16.4 16.4 16.4 16.4 ...
## $ lat : num 48.2 48.2 48.2 48.2 48.2 48.2 48.2 48.2 48.2 48.2 ...
## $ season : Factor w/ 2 levels "fall","spring": 1 1 1 1 1 1 1 1 1 1 ...
## $ variant.id: int 162333 162334 162335 162336 162337 162338 162339 162340 162341 162342 ...
## $ genotype : chr "A,C" "A,C" "C,T" "T,C" ...
## $ af : num 0.9773 NA 0.2273 0 0.0244 ...
## $ col : chr "3_prime_UTR_variant" "3_prime_UTR_variant" "3_prime_UTR_variant" "3_prime_UTR_variant" ...
## $ continent : chr "Europe" "Europe" "Europe" "Europe" ...
## $ nEff : num 29 NA 29 33 28 28 27 25 27 25 ...
mat1 <- season.filter.dat[ , c(3, 6, 5, 9, 10, 14, 12, 15)]
names(mat1)
## [1] "sampleId" "city" "country" "season" "variant.id"
## [6] "continent" "af" "nEff"
str(mat1)
## 'data.frame': 285501 obs. of 8 variables:
## $ sampleId : chr "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" ...
## $ city : chr "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" ...
## $ country : chr "Austria" "Austria" "Austria" "Austria" ...
## $ season : Factor w/ 2 levels "fall","spring": 1 1 1 1 1 1 1 1 1 1 ...
## $ variant.id: int 162333 162334 162335 162336 162337 162338 162339 162340 162341 162342 ...
## $ continent : chr "Europe" "Europe" "Europe" "Europe" ...
## $ af : num 0.9773 NA 0.2273 0 0.0244 ...
## $ nEff : num 29 NA 29 33 28 28 27 25 27 25 ...
dim(mat1)
## [1] 285501 8
mat1 <- mat1[!is.na(mat1$nEff), ]
dim(mat1)
## [1] 269454 8
mat1 <- mat1[mat1$nEff >= 28, ] ## Applying a Neff filter of 28
dim(mat1)
## [1] 212017 8
mat1 <- mat1[ , -8]
str(mat1)
## 'data.frame': 212017 obs. of 7 variables:
## $ sampleId : chr "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" ...
## $ city : chr "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" ...
## $ country : chr "Austria" "Austria" "Austria" "Austria" ...
## $ season : Factor w/ 2 levels "fall","spring": 1 1 1 1 1 1 1 1 1 1 ...
## $ variant.id: int 162333 162335 162336 162337 162338 162350 162351 162352 162353 162357 ...
## $ continent : chr "Europe" "Europe" "Europe" "Europe" ...
## $ af : num 0.9773 0.2273 0 0.0244 0 ...
## Changing name of cities
mat1$city <- as.factor(mat1$city)
levels(mat1$city)
## [1] "Akaa" "Athens"
## [3] "Barcelona" "Benton Harbor"
## [5] "Broggingen" "Brzezina"
## [7] "Chalet \xe0 Gobet" "Charlottesville"
## [9] "Charlotttesville" "Chornobyl (Cooling pond)"
## [11] "Chornobyl Applegarden" "Chornobyl Applegarden (New)"
## [13] "Chornobyl Kopachi" "Chornobyl Polisske"
## [15] "Chornobyl Yaniv" "Cross Plains"
## [17] "Drogobych" "Esparto"
## [19] "Gimenells" "Gotheron"
## [21] "Gross-Enzersdorf" "Homestead"
## [23] "Ithaca" "Kalna"
## [25] "Karensminde" "Karensminde Orchard"
## [27] "Kharkiv" "Kyiv"
## [29] "Kyiv (Vyshneve)" "Lancaster"
## [31] "Linvilla" "Market Harborough"
## [33] "Mauternbach" "Munich"
## [35] "Odesa" "Odessa"
## [37] "Sheffield" "Slankamen. Vinogradi"
## [39] "State College" "Sudbury"
## [41] "Topeka" "Tuolumne"
## [43] "valday" "Valday"
## [45] "Viltain" "Yesiloz"
## [47] "Yesiloz TR13" "Yesiloz TR14"
## [49] "Yesiloz TR15" "Yesiloz TR16"
## [51] "Yesiloz TR17" "Yesiloz TR18"
levels(mat1$city) <- gsub("valday", "Valday", levels(mat1$city))
levels(mat1$city) <- gsub("Odesa", "Odessa", levels(mat1$city))
levels(mat1$city) <- gsub("Charlotttesville", "Charlottesville", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR13", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR14", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR15", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR16", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR17", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR18", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Karensminde Orchard", "Karensminde", levels(mat1$city))
levels(mat1$city) <- gsub("Slankamen. Vinogradi", "Slankamen-Vinogradi", levels(mat1$city))
levels(mat1$city) <- gsub("Chalet \xe0 Gobet", "Chalet-A-Gobet", levels(mat1$city))
levels(mat1$city)
## [1] "Akaa" "Athens"
## [3] "Barcelona" "Benton Harbor"
## [5] "Broggingen" "Brzezina"
## [7] "Chalet-A-Gobet" "Charlottesville"
## [9] "Chornobyl (Cooling pond)" "Chornobyl Applegarden"
## [11] "Chornobyl Applegarden (New)" "Chornobyl Kopachi"
## [13] "Chornobyl Polisske" "Chornobyl Yaniv"
## [15] "Cross Plains" "Drogobych"
## [17] "Esparto" "Gimenells"
## [19] "Gotheron" "Gross-Enzersdorf"
## [21] "Homestead" "Ithaca"
## [23] "Kalna" "Karensminde"
## [25] "Kharkiv" "Kyiv"
## [27] "Kyiv (Vyshneve)" "Lancaster"
## [29] "Linvilla" "Market Harborough"
## [31] "Mauternbach" "Munich"
## [33] "Odessa" "Sheffield"
## [35] "Slankamen-Vinogradi" "State College"
## [37] "Sudbury" "Topeka"
## [39] "Tuolumne" "Valday"
## [41] "Viltain" "Yesiloz"
levels(mat1$city)[c(9, 10, 11, 12, 13, 14)] <- "Chornobyl"
levels(mat1$city)
## [1] "Akaa" "Athens" "Barcelona"
## [4] "Benton Harbor" "Broggingen" "Brzezina"
## [7] "Chalet-A-Gobet" "Charlottesville" "Chornobyl"
## [10] "Cross Plains" "Drogobych" "Esparto"
## [13] "Gimenells" "Gotheron" "Gross-Enzersdorf"
## [16] "Homestead" "Ithaca" "Kalna"
## [19] "Karensminde" "Kharkiv" "Kyiv"
## [22] "Kyiv (Vyshneve)" "Lancaster" "Linvilla"
## [25] "Market Harborough" "Mauternbach" "Munich"
## [28] "Odessa" "Sheffield" "Slankamen-Vinogradi"
## [31] "State College" "Sudbury" "Topeka"
## [34] "Tuolumne" "Valday" "Viltain"
## [37] "Yesiloz"
levels(mat1$city)[22] <- "Kyiv"
levels(mat1$city)
## [1] "Akaa" "Athens" "Barcelona"
## [4] "Benton Harbor" "Broggingen" "Brzezina"
## [7] "Chalet-A-Gobet" "Charlottesville" "Chornobyl"
## [10] "Cross Plains" "Drogobych" "Esparto"
## [13] "Gimenells" "Gotheron" "Gross-Enzersdorf"
## [16] "Homestead" "Ithaca" "Kalna"
## [19] "Karensminde" "Kharkiv" "Kyiv"
## [22] "Lancaster" "Linvilla" "Market Harborough"
## [25] "Mauternbach" "Munich" "Odessa"
## [28] "Sheffield" "Slankamen-Vinogradi" "State College"
## [31] "Sudbury" "Topeka" "Tuolumne"
## [34] "Valday" "Viltain" "Yesiloz"
levels(mat1$city)[29] <- "Slankamenacki-Vinogradi"
levels(mat1$city)
## [1] "Akaa" "Athens"
## [3] "Barcelona" "Benton Harbor"
## [5] "Broggingen" "Brzezina"
## [7] "Chalet-A-Gobet" "Charlottesville"
## [9] "Chornobyl" "Cross Plains"
## [11] "Drogobych" "Esparto"
## [13] "Gimenells" "Gotheron"
## [15] "Gross-Enzersdorf" "Homestead"
## [17] "Ithaca" "Kalna"
## [19] "Karensminde" "Kharkiv"
## [21] "Kyiv" "Lancaster"
## [23] "Linvilla" "Market Harborough"
## [25] "Mauternbach" "Munich"
## [27] "Odessa" "Sheffield"
## [29] "Slankamenacki-Vinogradi" "State College"
## [31] "Sudbury" "Topeka"
## [33] "Tuolumne" "Valday"
## [35] "Viltain" "Yesiloz"
dim(mat1)
## [1] 212017 7
str(mat1)
## 'data.frame': 212017 obs. of 7 variables:
## $ sampleId : chr "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" ...
## $ city : Factor w/ 36 levels "Akaa","Athens",..: 15 15 15 15 15 15 15 15 15 15 ...
## $ country : chr "Austria" "Austria" "Austria" "Austria" ...
## $ season : Factor w/ 2 levels "fall","spring": 1 1 1 1 1 1 1 1 1 1 ...
## $ variant.id: int 162333 162335 162336 162337 162338 162350 162351 162352 162353 162357 ...
## $ continent : chr "Europe" "Europe" "Europe" "Europe" ...
## $ af : num 0.9773 0.2273 0 0.0244 0 ...
mat1 <- mat1[!mat1$country == "United Kingdom", ] ## The united kingdom only has 4 pooled samples, so I removed it from the analyses
mat2 <- reshape(mat1, idvar = c("sampleId", "city", "country", "season", "continent"), timevar = "variant.id", direction = "wide")
mat2[1:6, 1:10]
## sampleId city country season continent af.162333
## 9679 AT_gr_12_fall Gross-Enzersdorf Austria fall Europe 0.9772727
## 11292 AT_gr_12_spring Gross-Enzersdorf Austria spring Europe 0.8888889
## 1 AT_Mau_14_01 Mauternbach Austria spring Europe 1.0000000
## 1617 AT_Mau_14_02 Mauternbach Austria fall Europe NA
## 3227 AT_Mau_15_50 Mauternbach Austria spring Europe 0.7714286
## 4840 AT_Mau_15_51 Mauternbach Austria fall Europe 0.6984127
## af.162335 af.162336 af.162337 af.162338
## 9679 0.2272727 0.00000000 0.02439024 0
## 11292 NA 0.00000000 NA NA
## 1 0.1568627 0.01960784 NA 0
## 1617 NA 0.00000000 0.00000000 0
## 3227 0.1323529 0.00000000 0.00000000 0
## 4840 0.2656250 0.01282051 0.00000000 0
dim(mat2)
## [1] 170 1614
mat2$city <- as.character(mat2$city)
mat2$season <- as.character(mat2$season)
mat2 <- mat2[mat2$city != "Homestead", ]
mat2 <- mat2[mat2$city != "Athens", ]
mat2 <- mat2[mat2$city != "Sudbury", ]
mat2 <- mat2[mat2$city != "Brzezina", ]
mat2 <- mat2[mat2$city != "Kalna", ]
mat2 <- mat2[mat2$city != "Slankamenacki-Vinogradi", ]
mat2 <- mat2[mat2$city != "Topeka", ]
mat2 <- mat2[mat2$city != "Chalet-A-Gobet", ]
usa <- c("Lancaster", "Ithaca", "Cross Plains", "Benton Harbor", "Linvilla", "State College", "Tuolumne", "Esparto", "Charlottesville")
for(i in usa){
mat2$country[mat2$city == i] <- i
}
mat2$country <- as.factor(mat2$country)
levels(mat2$country)
## [1] "Austria" "Benton Harbor" "Charlottesville" "Cross Plains"
## [5] "Denmark" "Esparto" "Finland" "France"
## [9] "Germany" "Ithaca" "Lancaster" "Linvilla"
## [13] "Russia" "Spain" "State College" "Tuolumne"
## [17] "Turkey" "Ukraine"
levels(mat2$country)[c(2, 4)] <- "MI-WI"
levels(mat2$country)
## [1] "Austria" "MI-WI" "Charlottesville" "Denmark"
## [5] "Esparto" "Finland" "France" "Germany"
## [9] "Ithaca" "Lancaster" "Linvilla" "Russia"
## [13] "Spain" "State College" "Tuolumne" "Turkey"
## [17] "Ukraine"
levels(mat2$country)[c(9, 10)] <- "NY-MA"
levels(mat2$country)
## [1] "Austria" "MI-WI" "Charlottesville" "Denmark"
## [5] "Esparto" "Finland" "France" "Germany"
## [9] "NY-MA" "Linvilla" "Russia" "Spain"
## [13] "State College" "Tuolumne" "Turkey" "Ukraine"
levels(mat2$country)[c(10, 13)] <- "PA"
levels(mat2$country)
## [1] "Austria" "MI-WI" "Charlottesville" "Denmark"
## [5] "Esparto" "Finland" "France" "Germany"
## [9] "NY-MA" "PA" "Russia" "Spain"
## [13] "Tuolumne" "Turkey" "Ukraine"
mat3 <- as.data.frame(getGenotypeAlleles(genofile)) ## extracting genotypes from the GDS file
mat3[1:5, 1:5]
## 162333 162334 162335 162336 162337
## AT_Mau_14_01 C/C A/A C/T T/C <NA>
## AT_Mau_14_02 A/C A/A C/T T/T C/C
## AT_Mau_15_50 A/C A/A C/T T/T C/C
## AT_Mau_15_51 A/C <NA> C/T T/C C/C
## AT_See_14_44 A/C A/C C/T T/T C/C
for(i in 1:ncol(mat3)){
mat3[ , i] <- gsub("/", "", mat3[, i])
}
mat3[1:5, 1:5]
## 162333 162334 162335 162336 162337
## AT_Mau_14_01 CC AA CT TC <NA>
## AT_Mau_14_02 AC AA CT TT CC
## AT_Mau_15_50 AC AA CT TT CC
## AT_Mau_15_51 AC <NA> CT TC CC
## AT_See_14_44 AC AC CT TT CC
mat3$ind <- rownames(mat3)
dim(mat3)
## [1] 272 1614
mat3[1:5, 1610:1614]
## 163994 163995 163996 163997 ind
## AT_Mau_14_01 GA GG GC AA AT_Mau_14_01
## AT_Mau_14_02 GA GG GC AT AT_Mau_14_02
## AT_Mau_15_50 GA GG GC AA AT_Mau_15_50
## AT_Mau_15_51 GA GG GC AA AT_Mau_15_51
## AT_See_14_44 GA GA GC AA AT_See_14_44
mat4 <- mat3[mat3$ind %in% mat2$sampleId, ]
dim(mat4)
## [1] 152 1614
mat4[1:5, 1600:1614]
## 163983 163984 163985 163986 163987 163988 163989 163991 163992
## AT_Mau_14_01 AA AG TT CG AG CC AC CC AG
## AT_Mau_14_02 AA AG TT CG AG CC AC CC AA
## AT_Mau_15_50 AA AG TC CG AG CT AC CG AG
## AT_Mau_15_51 AA AG TT CG AG CC AC CG AA
## AT_gr_12_fall AA AG TT CG AG CT AC CG AA
## 163993 163994 163995 163996 163997 ind
## AT_Mau_14_01 TC GA GG GC AA AT_Mau_14_01
## AT_Mau_14_02 TC GA GG GC AT AT_Mau_14_02
## AT_Mau_15_50 TC GA GG GC AA AT_Mau_15_50
## AT_Mau_15_51 TC GA GG GC AA AT_Mau_15_51
## AT_gr_12_fall TC GA GG GC AA AT_gr_12_fall
same_ord <- mat2[ , c(1, 2, 3, 4)]
colnames(same_ord) <- c("ind", "city", "country", "season")
mat5 <- merge(mat4, same_ord, by = "ind")
dim(mat5)
## [1] 152 1617
mat5[1:5, 1:5]
## ind 162333 162334 162335 162336
## 1 AT_gr_12_fall AC <NA> CT TT
## 2 AT_gr_12_spring AC AA CT TT
## 3 AT_Mau_14_01 CC AA CT TC
## 4 AT_Mau_14_02 AC AA CT TT
## 5 AT_Mau_15_50 AC AA CT TT
rownames(mat5) <- mat5[ , 1]
dim(mat5)
## [1] 152 1617
mat5[1:5, 1610:1617]
## 163993 163994 163995 163996 163997 city country
## AT_gr_12_fall TC GA GG GC AA Gross-Enzersdorf Austria
## AT_gr_12_spring TC GA GG GC AA Gross-Enzersdorf Austria
## AT_Mau_14_01 TC GA GG GC AA Mauternbach Austria
## AT_Mau_14_02 TC GA GG GC AT Mauternbach Austria
## AT_Mau_15_50 TC GA GG GC AA Mauternbach Austria
## season
## AT_gr_12_fall fall
## AT_gr_12_spring spring
## AT_Mau_14_01 spring
## AT_Mau_14_02 fall
## AT_Mau_15_50 spring
mat6 <- mat5[ , -c(1, 1615, 1616, 1617)]
dim(mat6)
## [1] 152 1613
mat6[1:5, 1:5]
## 162333 162334 162335 162336 162337
## AT_gr_12_fall AC <NA> CT TT CT
## AT_gr_12_spring AC AA CT TT CC
## AT_Mau_14_01 CC AA CT TC <NA>
## AT_Mau_14_02 AC AA CT TT CC
## AT_Mau_15_50 AC AA CT TT CC
mat6[1:5, 1605:1613]
## 163988 163989 163991 163992 163993 163994 163995 163996 163997
## AT_gr_12_fall CT AC CG AA TC GA GG GC AA
## AT_gr_12_spring CC AA CG AA TC GA GG GC AA
## AT_Mau_14_01 CC AC CC AG TC GA GG GC AA
## AT_Mau_14_02 CC AC CC AA TC GA GG GC AT
## AT_Mau_15_50 CT AC CG AG TC GA GG GC AA
fly_gen <- df2genind(mat6, ploidy = 2, ind.names = rownames(mat6), pop = mat5$country, sep = "")
fly_gen
## /// GENIND OBJECT /////////
##
## // 152 individuals; 1,613 loci; 3,225 alleles; size: 2.7 Mb
##
## // Basic content
## @tab: 152 x 3225 matrix of allele counts
## @loc.n.all: number of alleles per locus (range: 1-2)
## @loc.fac: locus factor for the 3225 columns of @tab
## @all.names: list of allele names for each locus
## @ploidy: ploidy of each individual (range: 2-2)
## @type: codom
## @call: df2genind(X = mat6, sep = "", ind.names = rownames(mat6), pop = mat5$country,
## ploidy = 2)
##
## // Optional content
## @pop: population of each individual (group size range: 5-38)
fly_gen$tab[1:5, 1:5]
## 162333.A 162333.C 162334.A 162334.C 162335.C
## AT_gr_12_fall 1 1 NA NA 1
## AT_gr_12_spring 1 1 2 0 1
## AT_Mau_14_01 0 2 2 0 1
## AT_Mau_14_02 1 1 2 0 1
## AT_Mau_15_50 1 1 2 0 1
summary(fly_gen$pop)
## Austria MI-WI Charlottesville Denmark Esparto
## 6 9 6 5 5
## Finland France Germany NY-MA PA
## 6 10 12 6 18
## Russia Spain Tuolumne Turkey Ukraine
## 6 7 5 13 38
indmiss_fly <- propTyped(fly_gen, by = "ind")
indmiss_fly[which(indmiss_fly < 0.80)]
## PA_li_10_spring
## 0.7123373
barplot(indmiss_fly, ylim = c(0, 1), ylab = "Complete genotypes (proportion)", xlab = "", las = 2, cex.names = 0.4)
mtext("Genotypes", side = 1, line = 3.7)
fly_gen <- missingno(fly_gen, type = "geno", cutoff = 0.20) ## This is the genind file that I will use for all the analyses
fly_gen
## /// GENIND OBJECT /////////
##
## // 151 individuals; 1,613 loci; 3,225 alleles; size: 2.7 Mb
##
## // Basic content
## @tab: 151 x 3225 matrix of allele counts
## @loc.n.all: number of alleles per locus (range: 1-2)
## @loc.fac: locus factor for the 3225 columns of @tab
## @all.names: list of allele names for each locus
## @ploidy: ploidy of each individual (range: 2-2)
## @type: codom
## @call: .local(x = x, i = i, j = j, drop = drop)
##
## // Optional content
## @pop: population of each individual (group size range: 5-38)
## quality control begins here
mlg(fly_gen) ## keep only polymorphic loci
## #############################
## # Number of Individuals: 151
## # Number of MLG: 151
## #############################
## [1] 151
isPoly(fly_gen) %>% summary
## Mode FALSE TRUE
## logical 1 1612
poly_loci <- names(which(isPoly(fly_gen) == TRUE))
fly_gen2 <- fly_gen[loc = poly_loci]
isPoly(fly_gen2) %>% summary
## Mode TRUE
## logical 1612
fly_gen2$loc.n.all <- fly_gen2$loc.n.all[which(fly_gen2$loc.n.all == 2)]
n <- names(which(nAll(fly_gen2) == 2))
fly_gen2 <- fly_gen2[loc = n]
fly_gen2
## /// GENIND OBJECT /////////
##
## // 151 individuals; 1,612 loci; 3,224 alleles; size: 2.7 Mb
##
## // Basic content
## @tab: 151 x 3224 matrix of allele counts
## @loc.n.all: number of alleles per locus (range: 2-2)
## @loc.fac: locus factor for the 3224 columns of @tab
## @all.names: list of allele names for each locus
## @ploidy: ploidy of each individual (range: 2-2)
## @type: codom
## @call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
##
## // Optional content
## @pop: population of each individual (group size range: 5-38)
indmiss_fly2 <- propTyped(fly_gen2, by = "ind")
barplot(indmiss_fly2, ylim = c(0, 1), ylab = "Complete genotypes (proportion)", xlab = "", las = 2, cex.names = 0.4)
mtext("Genotypes", side = 1, line = 3.7)
locmiss_fly <- propTyped(fly_gen2, by = "loc")
locmiss_fly[which(locmiss_fly < 0.80)] ## print loci with < 80% complete genotypes
## 162334 162396 162412 162545 162546 162547 162549 162551
## 0.6225166 0.6092715 0.7880795 0.7947020 0.7682119 0.7947020 0.7748344 0.7880795
## 162553 162554 162555 162557 162560 162563 162587 162623
## 0.7880795 0.7814570 0.7880795 0.7947020 0.7814570 0.7947020 0.7748344 0.5298013
## 162746 162755 162756 162882 162892 162986 162987 162988
## 0.4238411 0.5562914 0.5562914 0.7748344 0.4238411 0.7152318 0.7152318 0.7284768
## 162989 162990 162992 163029 163072 163073 163074 163075
## 0.7284768 0.7284768 0.7549669 0.6291391 0.4834437 0.4370861 0.4834437 0.4834437
## 163076 163118 163119 163120 163121 163122 163123 163124
## 0.4834437 0.6688742 0.6688742 0.6688742 0.6688742 0.6688742 0.6158940 0.6158940
## 163125 163307 163363 163365 163366 163608 163648 163650
## 0.4370861 0.4768212 0.7947020 0.7947020 0.7483444 0.7350993 0.5165563 0.5827815
## 163692 163715 163716 163717 163765 163821 163908 163910
## 0.4569536 0.7086093 0.7086093 0.7086093 0.6357616 0.6092715 0.5231788 0.5629139
## 163914 163915 163916 163917 163922 163958
## 0.6158940 0.5894040 0.5827815 0.5761589 0.6953642 0.7019868
barplot(locmiss_fly, ylim = c(0, 1), ylab = "Complete genotypes (proportion)", xlab = "", las = 2, cex.names = 0.4)
mtext("Locus", side = 1, line = 3)
fly_gen3 <- missingno(fly_gen2, type = "loci", cutoff = 0.20)
fly_gen3
## /// GENIND OBJECT /////////
##
## // 151 individuals; 1,550 loci; 3,100 alleles; size: 2.6 Mb
##
## // Basic content
## @tab: 151 x 3100 matrix of allele counts
## @loc.n.all: number of alleles per locus (range: 2-2)
## @loc.fac: locus factor for the 3100 columns of @tab
## @all.names: list of allele names for each locus
## @ploidy: ploidy of each individual (range: 2-2)
## @type: codom
## @call: .local(x = x, i = i, j = j, drop = drop)
##
## // Optional content
## @pop: population of each individual (group size range: 5-38)
locmiss_fly3 <- propTyped(fly_gen3, by = "loc")
locmiss_fly3[which(locmiss_fly3 < 0.80)]
## named numeric(0)
fly_gen3$tab[1:5, 1:5]
## 162333.A 162333.C 162335.C 162335.T 162336.T
## AT_gr_12_fall 1 1 1 1 2
## AT_gr_12_spring 1 1 1 1 2
## AT_Mau_14_01 0 2 1 1 1
## AT_Mau_14_02 1 1 1 1 2
## AT_Mau_15_50 1 1 1 1 2
min(ploidy(fly_gen3))
## [1] 2
max(ploidy(fly_gen3))
## [1] 2
summary(fly_gen3$pop)
## Austria MI-WI Charlottesville Denmark Esparto
## 6 9 6 5 5
## Finland France Germany NY-MA PA
## 6 10 12 6 17
## Russia Spain Tuolumne Turkey Ukraine
## 6 7 5 13 38
mlg(fly_gen3)
## #############################
## # Number of Individuals: 151
## # Number of MLG: 151
## #############################
## [1] 151
isPoly(fly_gen3) %>% summary
## Mode TRUE
## logical 1550
poly_loci3 <- names(which(isPoly(fly_gen3) == TRUE))
fly_gen3 <- fly_gen3[loc = poly_loci3]
isPoly(fly_gen3) %>% summary
## Mode TRUE
## logical 1550
barplot(locmiss_fly3, ylim = c(0, 1), ylab = "Complete genotypes (proportion)", xlab = "", las = 2, cex.names = 0.4)
mtext("Locus", side = 1, line = 3)
## computing basic stats
basic_fly <- basic.stats(fly_gen3, diploid = TRUE)
## testing for H-W equilibrium
## Chi-squared test: p-value
HWE.test <- data.frame(sapply(seppop(fly_gen3),
function(ls) pegas::hw.test(ls, B = 0)[ , 3]))
HWE.test.chisq <- t(data.matrix(HWE.test))
HWE.test.chisq[1:5, 1:5]
## 162333 162335 162336 162337 162338
## Austria 0.08018122 0.01430588 0.6242061 0.8037847 1.0000000
## MI.WI 0.08968602 0.04722090 1.0000000 1.0000000 0.5485062
## Charlottesville 0.23013934 0.02534732 1.0000000 0.8237839 0.8037847
## Denmark 0.33790402 0.02534732 1.0000000 0.5761501 1.0000000
## Esparto 0.23013934 0.04550026 1.0000000 0.7750970 0.8037847
## Monte Carlo: p-value
HWE.test <- data.frame(sapply(seppop(fly_gen3),
function(ls) pegas::hw.test(ls, B = 1000)[ , 4]))
HWE.test.MC <- t(data.matrix(HWE.test))
HWE.test.MC[1:5, 1:5]
## 162333 162335 162336 162337 162338
## Austria 0.398 0.092 1 1 1
## MI.WI 0.425 0.169 1 1 1
## Charlottesville 1.000 0.132 1 1 1
## Denmark 1.000 0.131 1 1 1
## Esparto 1.000 0.330 1 1 1
alpha <- 0.05
Prop.loci.out.of.HWE <- data.frame(Chisq = apply(HWE.test.chisq < alpha, 2, mean), MC = apply(HWE.test.MC < alpha, 2, mean))
Prop.loci.out.of.HWE[1:10, ]
## Chisq MC
## 162333 0.3333333 0.2666667
## 162335 1.0000000 0.4000000
## 162336 0.0000000 0.0000000
## 162337 0.0000000 0.0000000
## 162338 0.0000000 0.0000000
## 162339 0.0000000 0.0000000
## 162340 0.0000000 0.0000000
## 162341 1.0000000 0.4666667
## 162342 0.0000000 0.0000000
## 162343 0.0000000 0.0000000
## "False discovery rate" correction for the number of tests. Here I use the function ‘p.adjust’ with the argument method = "fdr" to adjust the p-values from the previous tests
Chisq.fdr <- matrix(p.adjust(HWE.test.chisq, method = "fdr"),
nrow = nrow(HWE.test.chisq))
MC.fdr <- matrix(p.adjust(HWE.test.MC, method = "fdr"),
nrow = nrow(HWE.test.MC))
Prop.loci.out.of.HWE <- data.frame(Chisq = apply(HWE.test.chisq<alpha, 2, mean),
MC = apply(HWE.test.MC<alpha, 2, mean),
Chisq.fdr = apply(Chisq.fdr<alpha, 2, mean),
MC.fdr = apply(MC.fdr<alpha, 2, mean))
head(Prop.loci.out.of.HWE)
## Chisq MC Chisq.fdr MC.fdr
## 162333 0.3333333 0.2666667 0.2666667 0.06666667
## 162335 1.0000000 0.4000000 0.6000000 0.33333333
## 162336 0.0000000 0.0000000 0.0000000 0.00000000
## 162337 0.0000000 0.0000000 0.0000000 0.00000000
## 162338 0.0000000 0.0000000 0.0000000 0.00000000
## 162339 0.0000000 0.0000000 0.0000000 0.00000000
loci <- data.frame()
idx <- 1
for(i in 1:nrow(Prop.loci.out.of.HWE)){
fdr <- Prop.loci.out.of.HWE[i , ]
if(fdr$MC.fdr > 0.5){
loci <- rbind(fdr, loci)
}
}
loci ## none of the loci are consistenly out of HWE. Some of them are out of HWE in less than 50% of the populations.
## data frame with 0 columns and 0 rows
dim(loci)
## [1] 0 0
## check for linkage disequilibrium (LD)
LD <- as.genclone(fly_gen3)
LD
##
## This is a genclone object
## -------------------------
## Genotype information:
##
## 151 original multilocus genotypes
## 151 diploid individuals
## 1550 codominant loci
##
## Population information:
##
## 0 strata.
## 15 populations defined -
## Austria, MI-WI, Charlottesville, ..., Tuolumne, Turkey, Ukraine
quartz()
LD.general <- poppr::ia(LD, sample = 500)
LD.general
## Ia p.Ia rbarD p.rD
## 15.592940300 0.001996008 0.013771123 0.001996008
Computing pairwise Fst test
fly_fst <- genet.dist(fly_gen3, method = "WC84") %>% round(digits = 3)
fst.mat <- as.matrix(fly_fst)
fst.mat[fst.mat < 0] <- 0
fst.mat[1:10, 1:10]
## Austria MI-WI Charlottesville Denmark Esparto Finland France
## Austria 0.000 0.053 0.058 0.018 0.056 0.015 0.021
## MI-WI 0.053 0.000 0.005 0.049 0.012 0.049 0.042
## Charlottesville 0.058 0.005 0.000 0.054 0.017 0.057 0.045
## Denmark 0.018 0.049 0.054 0.000 0.049 0.024 0.018
## Esparto 0.056 0.012 0.017 0.049 0.000 0.054 0.041
## Finland 0.015 0.049 0.057 0.024 0.054 0.000 0.022
## France 0.021 0.042 0.045 0.018 0.041 0.022 0.000
## Germany 0.012 0.041 0.046 0.016 0.043 0.013 0.010
## NY-MA 0.044 0.006 0.003 0.040 0.009 0.044 0.033
## PA 0.053 0.004 0.004 0.050 0.013 0.051 0.044
## Germany NY-MA PA
## Austria 0.012 0.044 0.053
## MI-WI 0.041 0.006 0.004
## Charlottesville 0.046 0.003 0.004
## Denmark 0.016 0.040 0.050
## Esparto 0.043 0.009 0.013
## Finland 0.013 0.044 0.051
## France 0.010 0.033 0.044
## Germany 0.000 0.034 0.041
## NY-MA 0.034 0.000 0.002
## PA 0.041 0.002 0.000
tab <- gi2gl(fly_gen3, parallel = FALSE, verbose = NULL) ## converting genind object to genlight object
## Starting gi2gl
## Starting gl.compliance.check
## Processing genlight object with SNP data
## Checking coding of SNPs
## SNP data scored NA, 0, 1 or 2 confirmed
## Checking locus metrics and flags
## Recalculating locus metrics
## Checking for monomorphic loci
## No monomorphic loci detected
## Checking for loci with all missing data
## No loci with all missing data detected
## Checking whether individual names are unique.
## Checking for individual metrics
## Warning: Creating a slot for individual metrics
## Checking for population assignments
## Population assignments confirmed
## Spelling of coordinates checked and changed if necessary to
## lat/lon
## Completed: gl.compliance.check
## Completed: gi2gl
pval_fst <- gl.fst.pop(tab, nboots = 1000, percent = 95, nclusters = 1, verbose = NULL)
## Starting gl.fst.pop
## Processing genlight object with SNP data
## Completed: gl.fst.pop
fst <- as.matrix(as.dist(pval_fst$Fsts))
fst[fst < 0] <- 0
fst
## Austria Esparto Tuolumne Germany Denmark
## Austria 0.00000000 0.055703474 0.058895752 0.012410932 0.01800624
## Esparto 0.05570347 0.000000000 0.000000000 0.043366976 0.04899978
## Tuolumne 0.05889575 0.000000000 0.000000000 0.046380979 0.05326210
## Germany 0.01241093 0.043366976 0.046380979 0.000000000 0.01632085
## Denmark 0.01800624 0.048999783 0.053262097 0.016320855 0.00000000
## Spain 0.01644031 0.044356991 0.047779471 0.007609145 0.02535346
## Finland 0.01510726 0.054481851 0.060073773 0.013480562 0.02446933
## France 0.02086204 0.040896003 0.045503573 0.009973570 0.01796563
## NY-MA 0.04387058 0.008737692 0.007955468 0.033551239 0.04003517
## MI-WI 0.05336238 0.012416407 0.008962355 0.041320541 0.04880945
## PA 0.05310220 0.013471651 0.009573028 0.041313299 0.05015960
## Russia 0.01689966 0.053516381 0.060654501 0.020656074 0.02332836
## Turkey 0.01412191 0.060541318 0.062921149 0.018900400 0.02179383
## Ukraine 0.01416309 0.063774691 0.069481577 0.020327235 0.02369749
## Charlottesville 0.05819820 0.017272103 0.015222898 0.046035247 0.05402774
## Spain Finland France NY-MA MI-WI
## Austria 0.016440309 0.01510726 0.02086204 0.043870575 0.053362379
## Esparto 0.044356991 0.05448185 0.04089600 0.008737692 0.012416407
## Tuolumne 0.047779471 0.06007377 0.04550357 0.007955468 0.008962355
## Germany 0.007609145 0.01348056 0.00997357 0.033551239 0.041320541
## Denmark 0.025353461 0.02446933 0.01796563 0.040035167 0.048809450
## Spain 0.000000000 0.02001428 0.01567832 0.029541647 0.039313950
## Finland 0.020014275 0.00000000 0.02212788 0.043508846 0.048678641
## France 0.015678321 0.02212788 0.00000000 0.033457193 0.042244680
## NY-MA 0.029541647 0.04350885 0.03345719 0.000000000 0.005774646
## MI-WI 0.039313950 0.04867864 0.04224468 0.005774646 0.000000000
## PA 0.038958914 0.05145801 0.04367546 0.001936861 0.004351338
## Russia 0.031639501 0.01913517 0.02199934 0.047419819 0.052957069
## Turkey 0.026809159 0.01677821 0.02823027 0.049358359 0.054304445
## Ukraine 0.031831900 0.02245250 0.02881299 0.055798661 0.064183546
## Charlottesville 0.046471038 0.05728551 0.04450952 0.002951612 0.004506388
## PA Russia Turkey Ukraine Charlottesville
## Austria 0.053102204 0.016899657 0.01412191 0.014163086 0.058198201
## Esparto 0.013471651 0.053516381 0.06054132 0.063774691 0.017272103
## Tuolumne 0.009573028 0.060654501 0.06292115 0.069481577 0.015222898
## Germany 0.041313299 0.020656074 0.01890040 0.020327235 0.046035247
## Denmark 0.050159598 0.023328359 0.02179383 0.023697489 0.054027742
## Spain 0.038958914 0.031639501 0.02680916 0.031831900 0.046471038
## Finland 0.051458012 0.019135166 0.01677821 0.022452499 0.057285510
## France 0.043675456 0.021999335 0.02823027 0.028812985 0.044509519
## NY-MA 0.001936861 0.047419819 0.04935836 0.055798661 0.002951612
## MI-WI 0.004351338 0.052957069 0.05430445 0.064183546 0.004506388
## PA 0.000000000 0.058255707 0.05610334 0.066041832 0.004019586
## Russia 0.058255707 0.000000000 0.01717863 0.007515034 0.059645911
## Turkey 0.056103340 0.017178630 0.00000000 0.012939846 0.058036081
## Ukraine 0.066041832 0.007515034 0.01293985 0.000000000 0.067300473
## Charlottesville 0.004019586 0.059645911 0.05803608 0.067300473 0.000000000
## png("heatmap.png", width = 7, height = 7, units = "in", res = 300)
## pdf("heatmap.pdf")
my_palette <- colorRampPalette(c("blue", "red"))(n = 100)
heatmap.2(fst.mat, density.info = "none", trace = "none", scale = "none", cexRow = 0.7, cexCol = 0.7, key.title = "Fst", srtCol = 45, srtRow = 35, margins = c(8.5, 5), col = my_palette, symbreaks = TRUE)
## dev.off()
Figure 1. Heatmap depicting genetic differentiation among populations of D. melanogaster based on pairwise Fst values. In some cases, samples from adjacent localities were combined to avoid low sample sizes. Abbreviations are as follows: PA = Pennsylvania; MI-WI = Michigan and Wisconsin; NY-MA = New York and Massachusetts.
Genetic admixture proportions
##seqSetFilter(genofile, variant.id = rownames(snps), sample.id = rownames(indi))
##seqGDS2VCF(genofile, "my_vcf.gz", verbose = TRUE)
##outest <- read.vcfR("my_vcf.gz")
##dataout <- vcfR2genind(outest)
## exploring the data once again (do it all the time!)
fly_gen3
## /// GENIND OBJECT /////////
##
## // 151 individuals; 1,550 loci; 3,100 alleles; size: 2.6 Mb
##
## // Basic content
## @tab: 151 x 3100 matrix of allele counts
## @loc.n.all: number of alleles per locus (range: 2-2)
## @loc.fac: locus factor for the 3100 columns of @tab
## @all.names: list of allele names for each locus
## @ploidy: ploidy of each individual (range: 2-2)
## @type: codom
## @call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
##
## // Optional content
## @pop: population of each individual (group size range: 5-38)
fly_gen3$tab[1:5, 1:5]
## 162333.A 162333.C 162335.C 162335.T 162336.T
## AT_gr_12_fall 1 1 1 1 2
## AT_gr_12_spring 1 1 1 1 2
## AT_Mau_14_01 0 2 1 1 1
## AT_Mau_14_02 1 1 1 1 2
## AT_Mau_15_50 1 1 1 1 2
nLoc(fly_gen3) # number of loci
## [1] 1550
nPop(fly_gen3) # number of sites
## [1] 15
nInd(fly_gen3) # number of individuals
## [1] 151
summary(fly_gen3$pop) # sample size
## Austria MI-WI Charlottesville Denmark Esparto
## 6 9 6 5 5
## Finland France Germany NY-MA PA
## 6 10 12 6 17
## Russia Spain Tuolumne Turkey Ukraine
## 6 7 5 13 38
min(ploidy(fly_gen3))
## [1] 2
max(ploidy(fly_gen3))
## [1] 2
## gl2geno(tab, outfile = "gl_geno", outpath = getwd(), verbose = NULL) ## Converting genlight object to geno object
## Run snmf algorithm
#snmf1 <- snmf("gl_geno.geno",
# K = 1:10, # number of K ancestral populations to run
# repetitions = 50, # 50 repetitions for each K
# entropy = TRUE, # calculate cross-entropy
# project = "new")
snmf1 <- load.snmfProject("gl_geno.snmfProject")
## extract Q-matrix for the best run
plot(snmf1, col = "blue", cex = 1.5, pch = 19, las = 1)
## extract the cross-entropy of all runs where K = 9
ce <- cross.entropy(snmf1, K = 9)
ce
## K = 9
## run 1 0.3026330
## run 2 0.3006478
## run 3 0.2965800
## run 4 0.2934025
## run 5 0.2985019
## run 6 0.3038042
## run 7 0.3018757
## run 8 0.2907803
## run 9 0.2968350
## run 10 0.2961763
## run 11 0.2993841
## run 12 0.2929597
## run 13 0.2910719
## run 14 0.2818298
## run 15 0.2994594
## run 16 0.2894918
## run 17 0.2949548
## run 18 0.2914579
## run 19 0.2884607
## run 20 0.3000826
## run 21 0.2908541
## run 22 0.2981863
## run 23 0.2902612
## run 24 0.2904714
## run 25 0.3029123
## run 26 0.3011128
## run 27 0.2925949
## run 28 0.2941099
## run 29 0.3023984
## run 30 0.3020126
## run 31 0.2983558
## run 32 0.2997775
## run 33 0.2902648
## run 34 0.2993205
## run 35 0.2987137
## run 36 0.2952065
## run 37 0.2994274
## run 38 0.3042761
## run 39 0.2976461
## run 40 0.2993694
## run 41 0.2967565
## run 42 0.2944768
## run 43 0.2941358
## run 44 0.2882492
## run 45 0.2983567
## run 46 0.2942568
## run 47 0.2906855
## run 48 0.2999784
## run 49 0.3022837
## run 50 0.2979496
## find the run with the lowest cross-entropy
lowest.ce <- which.min(ce)
lowest.ce
## [1] 14
## extract Q-matrix for the best run
qmatrix <- as.data.frame(Q(snmf1, K = 9, run = lowest.ce))
head(qmatrix)
## V1 V2 V3 V4 V5 V6
## 1 9.99639e-05 9.99639e-05 0.25625200 9.99639e-05 0.000755320 0.613557
## 2 9.99730e-05 9.99730e-05 0.06442830 1.41056e-01 0.178439000 0.299589
## 3 9.99730e-05 7.78667e-02 0.24535100 2.57084e-02 0.000099973 0.328833
## 4 9.99730e-05 1.64043e-01 0.00834852 9.99730e-05 0.018455600 0.145086
## 5 1.32607e-02 4.37458e-02 0.59060000 9.99640e-05 0.000099964 0.184407
## 6 9.99640e-05 1.04134e-01 0.11331700 9.99640e-05 0.036707400 0.319622
## V7 V8 V9
## 1 9.99639e-05 0.049494200 0.079542400
## 2 1.66984e-02 0.000099973 0.299490000
## 3 9.99730e-05 0.188641000 0.133300000
## 4 9.99730e-05 0.516370000 0.147398000
## 5 9.99640e-05 0.167587000 0.000099964
## 6 9.99640e-05 0.425819000 0.000099964
## changing order of levels
pops <- fly_gen3$pop
levels(pops)
## [1] "Austria" "MI-WI" "Charlottesville" "Denmark"
## [5] "Esparto" "Finland" "France" "Germany"
## [9] "NY-MA" "PA" "Russia" "Spain"
## [13] "Tuolumne" "Turkey" "Ukraine"
levels(pops)[3] <- "Charlott."
qmplot <- cbind(qmatrix, pops)
qmplot$pops <- factor(qmplot$pops, levels = c("Esparto", "Tuolumne", "MI-WI", "Charlott.", "PA", "NY-MA", "Spain", "France", "Germany", "Austria", "Denmark", "Finland", "Ukraine", "Russia", "Turkey"))
qmplot <- qmplot[order(qmplot$pops), ]
pops <- qmplot$pops
## png("admixture_revision.png", width = 7, height = 7, units = "in", res = 300)
## pdf("admixture_revision.pdf")
layout(matrix(c(0, 1, 1, 0,
0, 1, 1, 0,
2, 2, 2, 2,
2, 2, 2, 2), nrow = 4, ncol = 4, byrow = TRUE))
par(mar = c(4, 4, 1, 0))
plot(snmf1, col = "blue", cex = 1.5, pch = 19, las = 1, cex.lab = 1.3)
Arrows(x = 9, y = 0.295, x1 = 9, y1 = 0.305, col = "red", arr.type = "triangle", code = 1, lwd = 1.5, arr.length = 0.2)
mtext("(a)", side = 2, at = 0.342, line = 2.8, font = 2, family = "serif", las = 1)
par(mar = c(5, 4.5, 1, 0))
barplot(t(qmplot[1:9]), col = RColorBrewer::brewer.pal(9,"Paired"), border = NA, space = 0, xlab = "", xaxt = "n", ylab = "Admixture proportion", las = 1, cex.lab = 1.4)
## adding population labels to the axis:
names <- unique(as.character(pops))
medians <- c()
for(i in 1:length(pops)){
axis(1, at = median(which(pops == pops[i])), labels = "")
medians <- c(medians, median(which(pops == pops[i])))
}
medians[1:10]
## [1] 3 3 3 3 3 8 8 8 8 8
text(x = as.numeric(unique(as.character(medians))), y = par("usr")[3] - 0.05, labels = names, xpd = NA, srt = 35, cex = 1, adj = 1.2)
mtext("Pools", side = 1, line = 4)
mtext("(b)", side = 2, at = 1.1, line = 2.4, font = 2, family = "serif", las = 1)
## dev.off()
Figure 2. Population structure analysis based on individual ancestry coefficients for a number of ancestral populations. (a) Cross-entropy values for each snmf run with k ranging between k = 1 and k = 10. The red arrow indicates the most likely value of k. (b) Admixture proportion across populations of D. melanogaster. Tick marks on the x axis correspond to the median number of pools per locality. Colors indicate genetic clusters.
## label column names of qmatrix
ncol(qmatrix)
## [1] 9
cluster_names = c()
for(i in 1:ncol(qmatrix)){
cluster_names[i] = paste("Cluster", i)
}
cluster_names
## [1] "Cluster 1" "Cluster 2" "Cluster 3" "Cluster 4" "Cluster 5" "Cluster 6"
## [7] "Cluster 7" "Cluster 8" "Cluster 9"
colnames(qmatrix) <- cluster_names
head(qmatrix)
## Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5 Cluster 6
## 1 9.99639e-05 9.99639e-05 0.25625200 9.99639e-05 0.000755320 0.613557
## 2 9.99730e-05 9.99730e-05 0.06442830 1.41056e-01 0.178439000 0.299589
## 3 9.99730e-05 7.78667e-02 0.24535100 2.57084e-02 0.000099973 0.328833
## 4 9.99730e-05 1.64043e-01 0.00834852 9.99730e-05 0.018455600 0.145086
## 5 1.32607e-02 4.37458e-02 0.59060000 9.99640e-05 0.000099964 0.184407
## 6 9.99640e-05 1.04134e-01 0.11331700 9.99640e-05 0.036707400 0.319622
## Cluster 7 Cluster 8 Cluster 9
## 1 9.99639e-05 0.049494200 0.079542400
## 2 1.66984e-02 0.000099973 0.299490000
## 3 9.99730e-05 0.188641000 0.133300000
## 4 9.99730e-05 0.516370000 0.147398000
## 5 9.99640e-05 0.167587000 0.000099964
## 6 9.99640e-05 0.425819000 0.000099964
dim(qmatrix)
## [1] 151 9
## add individual IDs
qmatrix$Ind <- indNames(fly_gen3)
## add site IDs
qmatrix$Site <- fly_gen3$pop
head(qmatrix)
## Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5 Cluster 6
## 1 9.99639e-05 9.99639e-05 0.25625200 9.99639e-05 0.000755320 0.613557
## 2 9.99730e-05 9.99730e-05 0.06442830 1.41056e-01 0.178439000 0.299589
## 3 9.99730e-05 7.78667e-02 0.24535100 2.57084e-02 0.000099973 0.328833
## 4 9.99730e-05 1.64043e-01 0.00834852 9.99730e-05 0.018455600 0.145086
## 5 1.32607e-02 4.37458e-02 0.59060000 9.99640e-05 0.000099964 0.184407
## 6 9.99640e-05 1.04134e-01 0.11331700 9.99640e-05 0.036707400 0.319622
## Cluster 7 Cluster 8 Cluster 9 Ind Site
## 1 9.99639e-05 0.049494200 0.079542400 AT_gr_12_fall Austria
## 2 1.66984e-02 0.000099973 0.299490000 AT_gr_12_spring Austria
## 3 9.99730e-05 0.188641000 0.133300000 AT_Mau_14_01 Austria
## 4 9.99730e-05 0.516370000 0.147398000 AT_Mau_14_02 Austria
## 5 9.99640e-05 0.167587000 0.000099964 AT_Mau_15_50 Austria
## 6 9.99640e-05 0.425819000 0.000099964 AT_Mau_15_51 Austria
## calculate mean admixture proportions for each site
clusters <- grep("Cluster", names(qmatrix)) ## indexes of cluster columns
avg_admix <- aggregate(qmatrix[, clusters], list(qmatrix$Site, qmatrix$Ind), mean)
colnames(avg_admix)[1:2] <- c("country", "Ind")
head(avg_admix)
## country Ind Cluster 1 Cluster 2 Cluster 3 Cluster 4
## 1 Austria AT_gr_12_fall 9.99639e-05 9.99639e-05 0.25625200 9.99639e-05
## 2 Austria AT_gr_12_spring 9.99730e-05 9.99730e-05 0.06442830 1.41056e-01
## 3 Austria AT_Mau_14_01 9.99730e-05 7.78667e-02 0.24535100 2.57084e-02
## 4 Austria AT_Mau_14_02 9.99730e-05 1.64043e-01 0.00834852 9.99730e-05
## 5 Austria AT_Mau_15_50 1.32607e-02 4.37458e-02 0.59060000 9.99640e-05
## 6 Austria AT_Mau_15_51 9.99640e-05 1.04134e-01 0.11331700 9.99640e-05
## Cluster 5 Cluster 6 Cluster 7 Cluster 8 Cluster 9
## 1 0.000755320 0.613557 9.99639e-05 0.049494200 0.079542400
## 2 0.178439000 0.299589 1.66984e-02 0.000099973 0.299490000
## 3 0.000099973 0.328833 9.99730e-05 0.188641000 0.133300000
## 4 0.018455600 0.145086 9.99730e-05 0.516370000 0.147398000
## 5 0.000099964 0.184407 9.99640e-05 0.167587000 0.000099964
## 6 0.036707400 0.319622 9.99640e-05 0.425819000 0.000099964
str(avg_admix)
## 'data.frame': 151 obs. of 11 variables:
## $ country : Factor w/ 15 levels "Austria","MI-WI",..: 1 1 1 1 1 1 5 5 5 5 ...
## $ Ind : chr "AT_gr_12_fall" "AT_gr_12_spring" "AT_Mau_14_01" "AT_Mau_14_02" ...
## $ Cluster 1: num 0.0001 0.0001 0.0001 0.0001 0.0133 ...
## $ Cluster 2: num 0.0001 0.0001 0.0779 0.164 0.0437 ...
## $ Cluster 3: num 0.25625 0.06443 0.24535 0.00835 0.5906 ...
## $ Cluster 4: num 0.0001 0.1411 0.0257 0.0001 0.0001 ...
## $ Cluster 5: num 0.000755 0.178439 0.0001 0.018456 0.0001 ...
## $ Cluster 6: num 0.614 0.3 0.329 0.145 0.184 ...
## $ Cluster 7: num 0.0001 0.0167 0.0001 0.0001 0.0001 ...
## $ Cluster 8: num 0.0495 0.0001 0.1886 0.5164 0.1676 ...
## $ Cluster 9: num 0.0795 0.2995 0.1333 0.1474 0.0001 ...
## import csv file containing coordinates
coor <- read.csv("coor.csv")
str(coor)
## 'data.frame': 16 obs. of 3 variables:
## $ country: chr "Austria" "Esparto" "Tuolumne" "Germany" ...
## $ long : num 15.96 -122.05 -120.26 9.71 10.21 ...
## $ lat : num 48.3 38.7 38 48.2 55.9 ...
head(coor)
## country long lat
## 1 Austria 15.965000 48.28750
## 2 Esparto -122.046000 38.68000
## 3 Tuolumne -120.260000 37.97000
## 4 Germany 9.714757 48.19901
## 5 Denmark 10.212586 55.94513
## 6 Spain 1.279236 41.51821
admix <- merge(coor, avg_admix, by = "country")
str(admix)
## 'data.frame': 151 obs. of 13 variables:
## $ country : chr "Austria" "Austria" "Austria" "Austria" ...
## $ long : num 16 16 16 16 16 ...
## $ lat : num 48.3 48.3 48.3 48.3 48.3 ...
## $ Ind : chr "AT_gr_12_fall" "AT_gr_12_spring" "AT_Mau_14_01" "AT_Mau_14_02" ...
## $ Cluster 1: num 0.0001 0.0001 0.0001 0.0001 0.0133 ...
## $ Cluster 2: num 0.0001 0.0001 0.0779 0.164 0.0437 ...
## $ Cluster 3: num 0.25625 0.06443 0.24535 0.00835 0.5906 ...
## $ Cluster 4: num 0.0001 0.1411 0.0257 0.0001 0.0001 ...
## $ Cluster 5: num 0.000755 0.178439 0.0001 0.018456 0.0001 ...
## $ Cluster 6: num 0.614 0.3 0.329 0.145 0.184 ...
## $ Cluster 7: num 0.0001 0.0167 0.0001 0.0001 0.0001 ...
## $ Cluster 8: num 0.0495 0.0001 0.1886 0.5164 0.1676 ...
## $ Cluster 9: num 0.0795 0.2995 0.1333 0.1474 0.0001 ...
head(admix)
## country long lat Ind Cluster 1 Cluster 2 Cluster 3
## 1 Austria 15.965 48.2875 AT_gr_12_fall 9.99639e-05 9.99639e-05 0.25625200
## 2 Austria 15.965 48.2875 AT_gr_12_spring 9.99730e-05 9.99730e-05 0.06442830
## 3 Austria 15.965 48.2875 AT_Mau_14_01 9.99730e-05 7.78667e-02 0.24535100
## 4 Austria 15.965 48.2875 AT_Mau_14_02 9.99730e-05 1.64043e-01 0.00834852
## 5 Austria 15.965 48.2875 AT_Mau_15_50 1.32607e-02 4.37458e-02 0.59060000
## 6 Austria 15.965 48.2875 AT_Mau_15_51 9.99640e-05 1.04134e-01 0.11331700
## Cluster 4 Cluster 5 Cluster 6 Cluster 7 Cluster 8 Cluster 9
## 1 9.99639e-05 0.000755320 0.613557 9.99639e-05 0.049494200 0.079542400
## 2 1.41056e-01 0.178439000 0.299589 1.66984e-02 0.000099973 0.299490000
## 3 2.57084e-02 0.000099973 0.328833 9.99730e-05 0.188641000 0.133300000
## 4 9.99730e-05 0.018455600 0.145086 9.99730e-05 0.516370000 0.147398000
## 5 9.99640e-05 0.000099964 0.184407 9.99640e-05 0.167587000 0.000099964
## 6 9.99640e-05 0.036707400 0.319622 9.99640e-05 0.425819000 0.000099964
colnames(admix)[c(5, 6, 7, 8, 9, 10, 11, 12, 13)] <- c("cluster1", "cluster2", "cluster3", "cluster4", "cluster5", "cluster6", "cluster7", "cluster8", "cluster9")
pies <- admix[ , c(1:3, 5:13)]
meanpies <- aggregate(pies[2:12], by = list(pies$country), mean)
Plotting admixture proportions on a map
## Plotting map
## png("map_revision.png", width = 7, height = 7, units = "in", res = 300)
## pdf("map_revision.pdf")
quartz()
layout(matrix(c(1, 1, 1, 1,
1, 1, 1, 1,
2, 2, 2, 2,
2, 2, 2, 2), nrow = 4, ncol = 4, byrow = TRUE))
map("state", fill = TRUE, col = "beige", border = "gray", asp = 1)
map.axes(cex.axis = 1, las = 1)
mtext(side = 1, line = 3, "Longitude")
mtext(side = 2, line = 3, "Latitude")
addCompass(-120, 29, rot = 0, useWest = TRUE, cex = 0.5, col.compass = "gainsboro")
## Adding pie charts to the map
for(i in 1:nrow(meanpies)){
add.pie(z = c(meanpies$cluster1[i], meanpies$cluster2[i], meanpies$cluster3[i], meanpies$cluster4[i], meanpies$cluster5[i], meanpies$cluster6[i], meanpies$cluster7[i], meanpies$cluster8[i], meanpies$cluster9[i]), x = meanpies$long[i], y = meanpies$lat[i], radius = 1.5, col = RColorBrewer::brewer.pal(9,"Paired"), labels = "", border = FALSE)
}
newmap <- getMap(resolution = "low")
plot(newmap, col = "beige", border = "gray", xlim = c(-10, 40), ylim = c(35, 71), asp = 1)
map.axes(cex.axis = 1, las = 1)
mtext(side = 1, line = 3, "Longitude")
mtext(side = 2, line = 3, "Latitude")
addCompass(-16, 40, rot = 0, useWest = TRUE, cex = 0.5, col.compass = "gainsboro")
## Adding pie charts to the map
for(i in 1:nrow(meanpies)){
add.pie(z = c(meanpies$cluster1[i], meanpies$cluster2[i], meanpies$cluster3[i], meanpies$cluster4[i], meanpies$cluster5[i], meanpies$cluster6[i], meanpies$cluster7[i], meanpies$cluster8[i], meanpies$cluster9[i]), x = meanpies$long[i], y = meanpies$lat[i], radius = 2.5, col = RColorBrewer::brewer.pal(9,"Paired"), labels = "", border = FALSE)
}
## dev.off()
Figure 3. Mean admixture proportions across populations of D. melanogaster surveyed in America (upper panel) and Europe (lower panel). Colors in the pie charts represent genetic clusters.
Isolation by Environment and Isolation by Distance analyses
unique(fly_gen3$pop)
## [1] Austria Esparto Tuolumne Germany
## [5] Denmark Spain Finland France
## [9] NY-MA MI-WI PA Russia
## [13] Turkey Ukraine Charlottesville
## 15 Levels: Austria MI-WI Charlottesville Denmark Esparto Finland ... Ukraine
obj <- matrix(NA, nrow = length(unique(fly_gen3$pop)), ncol = 3, byrow = TRUE)
idx <- 1
for(i in unique(fly_gen3$pop)){
long <- admix$long[admix$country == i][1]
lat <- admix$lat[admix$country == i][1]
country <- i
obj[idx, ] <- c(country, long, lat)
idx = idx + 1
}
obj
## [,1] [,2] [,3]
## [1,] "Austria" "15.965" "48.2875"
## [2,] "Esparto" "-122.046" "38.68"
## [3,] "Tuolumne" "-120.26" "37.97"
## [4,] "Germany" "9.7147565" "48.199012"
## [5,] "Denmark" "10.212586" "55.945128"
## [6,] "Spain" "1.279236" "41.518208"
## [7,] "Finland" "23.52" "61.1"
## [8,] "France" "3.5442054" "46.8656662"
## [9,] "NY-MA" "-74.085" "42.445"
## [10,] "MI-WI" "-88.04915" "42.61055"
## [11,] "PA" "-76.6350005" "40.3366975"
## [12,] "Russia" "33.235277" "58.02444"
## [13,] "Turkey" "32.260328" "40.231444"
## [14,] "Ukraine" "30.16599445" "49.47387778"
## [15,] "Charlottesville" "-78.47" "38"
obj <- as.data.frame(obj)
colnames(obj) <- c("country", "x", "y")
obj$x <- as.numeric(obj$x)
obj$y <- as.numeric(obj$y)
rownames(obj) <- obj[ , 1]
obj <- obj[ , c(2, 3)]
str(obj)
## 'data.frame': 15 obs. of 2 variables:
## $ x: num 15.96 -122.05 -120.26 9.71 10.21 ...
## $ y: num 48.3 38.7 38 48.2 55.9 ...
fly_gen3@other <- obj[ , c(1, 2)]
fly_gen3
## /// GENIND OBJECT /////////
##
## // 151 individuals; 1,550 loci; 3,100 alleles; size: 2.6 Mb
##
## // Basic content
## @tab: 151 x 3100 matrix of allele counts
## @loc.n.all: number of alleles per locus (range: 2-2)
## @loc.fac: locus factor for the 3100 columns of @tab
## @all.names: list of allele names for each locus
## @ploidy: ploidy of each individual (range: 2-2)
## @type: codom
## @call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
##
## // Optional content
## @pop: population of each individual (group size range: 5-38)
## @other: a list containing: x y
## genetic distance
GD.pop.PairwiseFst.hierfstat <- as.dist(hierfstat::pairwise.neifst(hierfstat::genind2hierfstat(fly_gen3)))
## geographic distance
Dgeo <- round(dist(fly_gen3$other), 3)
## making sure the order of rows and columns are the same in all of the matrices
ord.gen <- as.matrix(GD.pop.PairwiseFst.hierfstat)
ord.geo <- as.matrix(Dgeo)
ord.geo
## Austria Esparto Tuolumne Germany Denmark Spain Finland
## Austria 0.000 138.345 136.615 6.251 9.578 16.171 14.874
## Esparto 138.345 0.000 1.922 132.104 133.381 123.358 147.282
## Tuolumne 136.615 1.922 0.000 130.377 131.705 121.591 145.629
## Germany 6.251 132.104 130.377 0.000 7.762 10.761 18.895
## Denmark 9.578 133.381 131.705 7.762 0.000 16.969 14.271
## Spain 16.171 123.358 121.591 10.761 16.969 0.000 29.633
## Finland 14.874 147.282 145.629 18.895 14.271 29.633 0.000
## France 12.502 125.857 124.123 6.313 11.265 5.807 24.529
## NY-MA 90.239 48.109 46.391 83.997 85.372 75.370 99.372
## MI-WI 104.169 34.223 32.543 97.924 99.162 89.335 113.091
## PA 92.941 45.441 43.689 86.707 88.239 77.923 102.285
## Russia 19.826 156.482 154.800 25.490 23.116 35.967 10.190
## Turkey 18.178 154.314 152.537 23.912 27.074 31.008 22.625
## Ukraine 14.250 152.594 150.865 20.491 20.977 29.962 13.392
## Charlottesville 94.994 43.581 41.790 88.773 90.480 79.827 104.573
## France NY-MA MI-WI PA Russia Turkey Ukraine
## Austria 12.502 90.239 104.169 92.941 19.826 18.178 14.250
## Esparto 125.857 48.109 34.223 45.441 156.482 154.314 152.594
## Tuolumne 124.123 46.391 32.543 43.689 154.800 152.537 150.865
## Germany 6.313 83.997 97.924 86.707 25.490 23.912 20.491
## Denmark 11.265 85.372 99.162 88.239 23.116 27.074 20.977
## Spain 5.807 75.370 89.335 77.923 35.967 31.008 29.962
## Finland 24.529 99.372 113.091 102.285 10.190 22.625 13.392
## France 0.000 77.755 91.692 80.445 31.719 29.473 26.749
## NY-MA 77.755 0.000 13.965 3.309 108.445 106.368 104.488
## MI-WI 91.692 13.965 0.000 11.638 122.260 120.333 118.414
## PA 80.445 3.309 11.638 0.000 111.285 108.895 107.191
## Russia 31.719 108.445 122.260 111.285 0.000 17.820 9.085
## Turkey 29.473 106.368 120.333 108.895 17.820 0.000 9.477
## Ukraine 26.749 104.488 118.414 107.191 9.085 9.477 0.000
## Charlottesville 82.492 6.244 10.631 2.971 113.486 110.753 109.240
## Charlottesville
## Austria 94.994
## Esparto 43.581
## Tuolumne 41.790
## Germany 88.773
## Denmark 90.480
## Spain 79.827
## Finland 104.573
## France 82.492
## NY-MA 6.244
## MI-WI 10.631
## PA 2.971
## Russia 113.486
## Turkey 110.753
## Ukraine 109.240
## Charlottesville 0.000
ord.gen <- ord.gen[rownames(ord.geo), colnames(ord.geo)]
ord.gen
## Austria Esparto Tuolumne Germany Denmark Spain Finland France
## Austria 0.0000 0.0557 0.0588 0.0128 0.0181 0.0165 0.0151 0.0212
## Esparto 0.0557 0.0000 -0.0013 0.0441 0.0490 0.0446 0.0547 0.0414
## Tuolumne 0.0588 -0.0013 0.0000 0.0468 0.0533 0.0479 0.0601 0.0457
## Germany 0.0128 0.0441 0.0468 0.0000 0.0170 0.0077 0.0136 0.0100
## Denmark 0.0181 0.0490 0.0533 0.0170 0.0000 0.0256 0.0246 0.0186
## Spain 0.0165 0.0446 0.0479 0.0077 0.0256 0.0000 0.0200 0.0158
## Finland 0.0151 0.0547 0.0601 0.0136 0.0246 0.0200 0.0000 0.0222
## France 0.0212 0.0414 0.0457 0.0100 0.0186 0.0158 0.0222 0.0000
## NY-MA 0.0439 0.0089 0.0082 0.0335 0.0403 0.0295 0.0435 0.0334
## MI-WI 0.0540 0.0130 0.0094 0.0412 0.0501 0.0396 0.0492 0.0422
## PA 0.0541 0.0141 0.0099 0.0414 0.0519 0.0393 0.0521 0.0440
## Russia 0.0169 0.0535 0.0605 0.0210 0.0234 0.0318 0.0192 0.0222
## Turkey 0.0139 0.0600 0.0618 0.0188 0.0217 0.0264 0.0163 0.0280
## Ukraine 0.0137 0.0627 0.0674 0.0196 0.0234 0.0308 0.0214 0.0281
## Charlottesville 0.0582 0.0172 0.0153 0.0466 0.0541 0.0466 0.0574 0.0448
## NY-MA MI-WI PA Russia Turkey Ukraine Charlottesville
## Austria 0.0439 0.0540 0.0541 0.0169 0.0139 0.0137 0.0582
## Esparto 0.0089 0.0130 0.0141 0.0535 0.0600 0.0627 0.0172
## Tuolumne 0.0082 0.0094 0.0099 0.0605 0.0618 0.0674 0.0153
## Germany 0.0335 0.0412 0.0414 0.0210 0.0188 0.0196 0.0466
## Denmark 0.0403 0.0501 0.0519 0.0234 0.0217 0.0234 0.0541
## Spain 0.0295 0.0396 0.0393 0.0318 0.0264 0.0308 0.0466
## Finland 0.0435 0.0492 0.0521 0.0192 0.0163 0.0214 0.0574
## France 0.0334 0.0422 0.0440 0.0222 0.0280 0.0281 0.0448
## NY-MA 0.0000 0.0057 0.0017 0.0474 0.0482 0.0535 0.0031
## MI-WI 0.0057 0.0000 0.0042 0.0537 0.0537 0.0616 0.0049
## PA 0.0017 0.0042 0.0000 0.0596 0.0564 0.0646 0.0045
## Russia 0.0474 0.0537 0.0596 0.0000 0.0167 0.0067 0.0597
## Turkey 0.0482 0.0537 0.0564 0.0167 0.0000 0.0129 0.0574
## Ukraine 0.0535 0.0616 0.0646 0.0067 0.0129 0.0000 0.0656
## Charlottesville 0.0031 0.0049 0.0045 0.0597 0.0574 0.0656 0.0000
fst.dist <- round(as.dist(ord.gen), 3)
fst.dist
## Austria Esparto Tuolumne Germany Denmark Spain Finland France
## Esparto 0.056
## Tuolumne 0.059 -0.001
## Germany 0.013 0.044 0.047
## Denmark 0.018 0.049 0.053 0.017
## Spain 0.016 0.045 0.048 0.008 0.026
## Finland 0.015 0.055 0.060 0.014 0.025 0.020
## France 0.021 0.041 0.046 0.010 0.019 0.016 0.022
## NY-MA 0.044 0.009 0.008 0.034 0.040 0.029 0.044 0.033
## MI-WI 0.054 0.013 0.009 0.041 0.050 0.040 0.049 0.042
## PA 0.054 0.014 0.010 0.041 0.052 0.039 0.052 0.044
## Russia 0.017 0.054 0.060 0.021 0.023 0.032 0.019 0.022
## Turkey 0.014 0.060 0.062 0.019 0.022 0.026 0.016 0.028
## Ukraine 0.014 0.063 0.067 0.020 0.023 0.031 0.021 0.028
## Charlottesville 0.058 0.017 0.015 0.047 0.054 0.047 0.057 0.045
## NY-MA MI-WI PA Russia Turkey Ukraine
## Esparto
## Tuolumne
## Germany
## Denmark
## Spain
## Finland
## France
## NY-MA
## MI-WI 0.006
## PA 0.002 0.004
## Russia 0.047 0.054 0.060
## Turkey 0.048 0.054 0.056 0.017
## Ukraine 0.054 0.062 0.065 0.007 0.013
## Charlottesville 0.003 0.005 0.004 0.060 0.057 0.066
Dgeo
## Austria Esparto Tuolumne Germany Denmark Spain Finland
## Esparto 138.345
## Tuolumne 136.615 1.922
## Germany 6.251 132.104 130.377
## Denmark 9.578 133.381 131.705 7.762
## Spain 16.171 123.358 121.591 10.761 16.969
## Finland 14.874 147.282 145.629 18.895 14.271 29.633
## France 12.502 125.857 124.123 6.313 11.265 5.807 24.529
## NY-MA 90.239 48.109 46.391 83.997 85.372 75.370 99.372
## MI-WI 104.169 34.223 32.543 97.924 99.162 89.335 113.091
## PA 92.941 45.441 43.689 86.707 88.239 77.923 102.285
## Russia 19.826 156.482 154.800 25.490 23.116 35.967 10.190
## Turkey 18.178 154.314 152.537 23.912 27.074 31.008 22.625
## Ukraine 14.250 152.594 150.865 20.491 20.977 29.962 13.392
## Charlottesville 94.994 43.581 41.790 88.773 90.480 79.827 104.573
## France NY-MA MI-WI PA Russia Turkey Ukraine
## Esparto
## Tuolumne
## Germany
## Denmark
## Spain
## Finland
## France
## NY-MA 77.755
## MI-WI 91.692 13.965
## PA 80.445 3.309 11.638
## Russia 31.719 108.445 122.260 111.285
## Turkey 29.473 106.368 120.333 108.895 17.820
## Ukraine 26.749 104.488 118.414 107.191 9.085 9.477
## Charlottesville 82.492 6.244 10.631 2.971 113.486 110.753 109.240
NPP_file <- list.files("/Users/dpadil10/Dropbox (ASU)/npp-geotiff", full.names = TRUE)
NPP_file
## [1] "/Users/dpadil10/Dropbox (ASU)/npp-geotiff/npp_geotiff.tif"
NPP.data <- stack(NPP_file)
NPP.data.ext.dis <- extract(NPP.data, obj)
head(NPP.data.ext.dis)
## npp_geotiff
## [1,] 264522317824
## [2,] 391191265280
## [3,] 367969927168
## [4,] 348927459328
## [5,] 25091205120
## [6,] 268916719616
rownames(NPP.data.ext.dis) <- rownames(obj)
DNPP <- round(dist(log(NPP.data.ext.dis)), 3)
DNPP
## Austria Esparto Tuolumne Germany Denmark Spain Finland France
## Esparto 0.391
## Tuolumne 0.330 0.061
## Germany 0.277 0.114 0.053
## Denmark 2.355 2.747 2.685 2.632
## Spain 0.016 0.375 0.314 0.260 2.372
## Finland 0.610 1.001 0.940 0.886 1.746 0.626
## France 0.577 0.185 0.247 0.300 2.932 0.560 1.186
## NY-MA 0.342 0.050 0.011 0.065 2.697 0.325 0.951 0.235
## MI-WI 0.236 0.155 0.094 0.041 2.592 0.220 0.846 0.340
## PA 0.452 0.061 0.122 0.175 2.808 0.436 1.062 0.124
## Russia 0.549 0.940 0.879 0.826 1.806 0.565 0.061 1.126
## Turkey 0.626 1.017 0.956 0.903 1.729 0.643 0.017 1.203
## Ukraine 0.075 0.466 0.405 0.352 2.280 0.092 0.534 0.652
## Charlottesville 0.526 0.134 0.196 0.249 2.881 0.509 1.135 0.051
## NY-MA MI-WI PA Russia Turkey Ukraine
## Esparto
## Tuolumne
## Germany
## Denmark
## Spain
## Finland
## France
## NY-MA
## MI-WI 0.105
## PA 0.111 0.216
## Russia 0.891 0.785 1.001
## Turkey 0.968 0.862 1.078 0.077
## Ukraine 0.417 0.311 0.527 0.474 0.551
## Charlottesville 0.184 0.290 0.074 1.075 1.152 0.601
## R script for 'MMRR' function that performs Multiple Matrix Regression with Randomization analysis.
source("MMRR.R")
distances <- list(Dgeo = as.matrix(Dgeo), DNPP = as.matrix(DNPP))
MMRR(as.matrix(fst.dist), distances)
## $r.squared
## r.squared
## 0.86385
##
## $coefficients
## Intercept Dgeo DNPP
## 0.0072219678 0.0003551048 0.0036128387
##
## $tstatistic
## Intercept(t) Dgeo(t) DNPP(t)
## 5.380919 24.813658 3.906555
##
## $tpvalue
## Intercept(p) Dgeo(p) DNPP(p)
## 0.999 0.001 0.001
##
## $Fstatistic
## F-statistic
## 323.5867
##
## $Fpvalue
## F p-value
## 0.001
## partial mantel test
part.mant <- mantel.partial(fst.dist, DNPP, Dgeo, method = "pearson", permutations = 1000)
part.mant
##
## Partial Mantel statistic based on Pearson's product-moment correlation
##
## Call:
## mantel.partial(xdis = fst.dist, ydis = DNPP, zdis = Dgeo, method = "pearson", permutations = 1000)
##
## Mantel statistic r: 0.3608
## Significance: 0.001998
##
## Upper quantiles of permutations (null model):
## 90% 95% 97.5% 99%
## 0.0912 0.1217 0.1531 0.1984
## Permutation: free
## Number of permutations: 1000
part.mant2 <- mantel.partial(fst.dist, Dgeo, DNPP, method = "pearson", permutations = 1000)
part.mant2
##
## Partial Mantel statistic based on Pearson's product-moment correlation
##
## Call:
## mantel.partial(xdis = fst.dist, ydis = Dgeo, zdis = DNPP, method = "pearson", permutations = 1000)
##
## Mantel statistic r: 0.9262
## Significance: 0.000999
##
## Upper quantiles of permutations (null model):
## 90% 95% 97.5% 99%
## 0.101 0.167 0.223 0.338
## Permutation: free
## Number of permutations: 1000
## mantel test
ibd <- mantel.randtest(fst.dist, Dgeo)
ibd
## Monte-Carlo test
## Call: mantel.randtest(m1 = fst.dist, m2 = Dgeo)
##
## Observation: 0.9184113
##
## Based on 999 replicates
## Simulated p-value: 0.001
## Alternative hypothesis: greater
##
## Std.Obs Expectation Variance
## 9.173596292 0.003686283 0.009942643
quartz()
plot(ibd, las = 1, main = "") ## isolation by distance is clearly significant
mantCor <- mgram(fst.dist, Dgeo, nperm = 1000)
mantCor
## $mgram
## lag ngroup mantelr pval llim ulim
## [1,] 9.66 27 0.64245382 0.001 5.918708e-01 0.69524902
## [2,] 28.98 18 0.28493265 0.014 2.262106e-01 0.34487195
## [3,] 48.30 6 0.27711014 0.015 1.829388e-01 0.34511218
## [4,] 67.62 1 0.02464136 0.968 -5.108269e-17 0.03538879
## [5,] 86.94 16 -0.23755747 0.022 -3.308312e-01 -0.16036366
## [6,] 106.26 16 -0.43851519 0.001 -5.317588e-01 -0.36207546
## [7,] 125.58 11 -0.27585827 0.016 -3.384386e-01 -0.17825522
## [8,] 144.90 8 -0.39397231 0.002 -4.881426e-01 -0.28195888
##
## $resids
## [1] NA
##
## attr(,"class")
## [1] "mgram"
mantCor2 <- mgram(fst.dist, DNPP, nperm = 1000)
mantCor2
## $mgram
## lag ngroup mantelr pval llim ulim
## [1,] 0.1825625 44 0.36681725 0.001 2.370974e-01 4.910210e-01
## [2,] 0.5476875 23 -0.07860655 0.336 -2.271801e-01 6.511911e-02
## [3,] 0.9128125 19 -0.36758903 0.005 -4.938220e-01 -2.405678e-01
## [4,] 1.2779375 5 -0.03904606 0.654 -1.860046e-01 1.023411e-01
## [5,] 1.6430625 3 0.09357366 0.308 -2.944606e-17 1.305096e-01
## [6,] 2.0081875 0 0.00000000 1.000 0.000000e+00 0.000000e+00
## [7,] 2.3733125 3 0.10248140 0.260 -1.957163e-17 1.362118e-01
## [8,] 2.7384375 7 -0.15494628 0.064 -2.108242e-01 7.556146e-18
##
## $resids
## [1] NA
##
## attr(,"class")
## [1] "mgram"
plot(mantCor, las = 1)
plot(mantCor2, las = 1)
## filled dots indicate significant correlations. The Mantel correlogram shows that genotypes are relatively similar at short distance, while this similarity decreases with distance. The negative correlations indicate genetic differenciation between some populations
dens <- MASS::kde2d(as.vector(Dgeo), as.vector(fst.dist), n = 300)
myPal <- colorRampPalette(c("white", "blue", "gold", "orange", "red"))
plot(as.vector(Dgeo), as.vector(fst.dist), pch = 20, xlab = "Geographic Distance", ylab = "Genetic Distance", las = 1)
image(dens, col = transp(myPal(300), 0.7), add = TRUE)
abline(lm(as.vector(fst.dist) ~ as.vector(Dgeo)), lwd = 2)
lines(loess.smooth(as.vector(Dgeo), as.vector(fst.dist)), col = "red", lwd = 2)
dens2 <- MASS::kde2d(as.vector(DNPP), as.vector(fst.dist), n = 300)
myPal2 <- colorRampPalette(c("white", "blue", "gold", "orange", "red"))
plot(as.vector(DNPP), as.vector(fst.dist), pch = 20, xlab = "NPP Distance", ylab = "Genetic Distance", las = 1)
image(dens2, col = transp(myPal2(300), 0.7), add = TRUE)
abline(lm(as.vector(fst.dist) ~ as.vector(DNPP)), lwd = 2)
lines(loess.smooth(as.vector(DNPP), as.vector(fst.dist)), col = "red", lwd = 2)
## png("distance.png", width = 7, height = 7, units = "in", res = 300)
## pdf("distance.pdf")
layout(matrix(c(1, 1, 2, 2,
1, 1, 2, 2,
3, 3, 4, 4,
3, 3, 4, 4), nrow = 4, ncol = 4, byrow = TRUE))
par(mar = c(4.2, 4.1, 1, 1))
plot(as.vector(Dgeo), as.vector(fst.dist), pch = 20, xlab = "Geographic Distance", ylab = "Genetic Distance", las = 1, cex.lab = 1.4)
image(dens, col = transp(myPal(300), 0.7), add = TRUE)
abline(lm(as.vector(fst.dist) ~ as.vector(Dgeo)), lwd = 2)
lines(loess.smooth(as.vector(Dgeo), as.vector(fst.dist)), col = "red", lwd = 2)
mtext("(a)", side = 2, at = 0.07, line = 2.6, font = 2, family = "serif", las = 1)
par(mar = c(4, 4.1, 1, 1))
plot(as.vector(DNPP), as.vector(fst.dist), pch = 20, xlab = "NPP Distance", ylab = "Genetic Distance", las = 1, cex.lab = 1.4)
image(dens2, col = transp(myPal2(300), 0.7), add = TRUE)
abline(lm(as.vector(fst.dist) ~ as.vector(DNPP)), lwd = 2)
lines(loess.smooth(as.vector(DNPP), as.vector(fst.dist)), col = "red", lwd = 2)
mtext("(b)", side = 2, at = 0.07, line = 3.1, font = 2, family = "serif", las = 1)
par(mar = c(4.2, 4.1, 1, 1))
plot(mantCor, las = 1, cex.axis = 1, cex.lab = 1.6, xlab = "Geographic Distance")
mtext("(c)", side = 2, at = 0.7, line = 2.6, font = 2, family = "serif", las = 1)
par(mar = c(4, 4.1, 1, 1))
plot(mantCor2, las = 1, cex.axis = 1, cex.lab = 1.6, ylim = c(-0.4, 0.6), xlab = "NPP Distance")
mtext("(d)", side = 2, at = 0.65, line = 3.1, font = 2, family = "serif", las = 1)
## dev.off()
Figure 6. Patterns of isolation by distance and isolation by environment according to Partial Mantel tests and Correlograms. (a-b) Geographic and net primary production distances as a function of genetic distance. Colors represent estimated probability densities and the red line a smoothed local mean. (c-d) Mantel correlation at different distance classes as a function of geographic and net primary production distances. Filled dots indicate significant correlations.
Analysis of seasonlity and adaptation test
## extracting environmental variables from WorldClim and other sources
mat_adap <- admix[ , c(1, 2, 3, 4)]
head(mat_adap)
## country long lat Ind
## 1 Austria 15.965 48.2875 AT_gr_12_fall
## 2 Austria 15.965 48.2875 AT_gr_12_spring
## 3 Austria 15.965 48.2875 AT_Mau_14_01
## 4 Austria 15.965 48.2875 AT_Mau_14_02
## 5 Austria 15.965 48.2875 AT_Mau_15_50
## 6 Austria 15.965 48.2875 AT_Mau_15_51
mat_adap$season <- rep(NA, nrow(mat_adap))
idx <- 1
for(i in mat_adap$Ind){
obj <- unique(afis$season[afis$sampleId == i])
mat_adap[idx, 5] <- obj
idx = idx + 1
}
mat_adap$season[mat_adap$season == "frost"] <- "fall"
head(mat_adap)
## country long lat Ind season
## 1 Austria 15.965 48.2875 AT_gr_12_fall fall
## 2 Austria 15.965 48.2875 AT_gr_12_spring spring
## 3 Austria 15.965 48.2875 AT_Mau_14_01 spring
## 4 Austria 15.965 48.2875 AT_Mau_14_02 fall
## 5 Austria 15.965 48.2875 AT_Mau_15_50 spring
## 6 Austria 15.965 48.2875 AT_Mau_15_51 fall
## extracting NPP
NPP_file <- list.files("/Users/dpadil10/Dropbox (ASU)/npp-geotiff", full.names = TRUE)
NPP_file
## [1] "/Users/dpadil10/Dropbox (ASU)/npp-geotiff/npp_geotiff.tif"
NPP.data <- stack(NPP_file)
prod <- mat_adap[ , c(2, 3)] ## loading the dataframe with the coordinates
head(prod)
## long lat
## 1 15.965 48.2875
## 2 15.965 48.2875
## 3 15.965 48.2875
## 4 15.965 48.2875
## 5 15.965 48.2875
## 6 15.965 48.2875
NPP.data.ext <- extract(NPP.data, prod)
head(NPP.data.ext)
## npp_geotiff
## [1,] 264522317824
## [2,] 264522317824
## [3,] 264522317824
## [4,] 264522317824
## [5,] 264522317824
## [6,] 264522317824
NPP.data.ext <- as.data.frame(NPP.data.ext)
colnames(NPP.data.ext) <- "NPP"
head(NPP.data.ext)
## NPP
## 1 264522317824
## 2 264522317824
## 3 264522317824
## 4 264522317824
## 5 264522317824
## 6 264522317824
##is.na(NPP.data.ext)
## extracting BIO15 precipitation seasonality (coefficient of variation), BIO4 temperature seasonality (standard deviation * 100)
files <- list.files("/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio", full.names = TRUE)
files
## [1] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_10.tif"
## [2] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_11.tif"
## [3] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_14.tif"
## [4] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_15.tif"
## [5] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_16.tif"
## [6] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_18.tif"
## [7] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_19.tif"
## [8] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_2.tif"
## [9] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_3.tif"
## [10] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_4.tif"
## [11] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_5.tif"
## [12] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_6.tif"
## [13] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_8.tif"
## [14] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_9.tif"
pre_sea <- stack(files[4])
tem_sea <- stack(files[10])
coord <- mat_adap[ , c(2, 3)]
head(coord)
## long lat
## 1 15.965 48.2875
## 2 15.965 48.2875
## 3 15.965 48.2875
## 4 15.965 48.2875
## 5 15.965 48.2875
## 6 15.965 48.2875
prec.season <- extract(pre_sea, coord)
temp.season <- extract(tem_sea, coord)
variables <- cbind(prec.season, temp.season, log(NPP.data.ext))
head(variables)
## wc2.1_30s_bio_15 wc2.1_30s_bio_4 NPP
## 1 37.24262 744.4373 26.30119
## 2 37.24262 744.4373 26.30119
## 3 37.24262 744.4373 26.30119
## 4 37.24262 744.4373 26.30119
## 5 37.24262 744.4373 26.30119
## 6 37.24262 744.4373 26.30119
colnames(variables) <- c("precseason", "tempseason", "logNPP")
head(variables)
## precseason tempseason logNPP
## 1 37.24262 744.4373 26.30119
## 2 37.24262 744.4373 26.30119
## 3 37.24262 744.4373 26.30119
## 4 37.24262 744.4373 26.30119
## 5 37.24262 744.4373 26.30119
## 6 37.24262 744.4373 26.30119
env <- cbind(mat_adap, variables)
head(env)
## country long lat Ind season precseason tempseason logNPP
## 1 Austria 15.965 48.2875 AT_gr_12_fall fall 37.24262 744.4373 26.30119
## 2 Austria 15.965 48.2875 AT_gr_12_spring spring 37.24262 744.4373 26.30119
## 3 Austria 15.965 48.2875 AT_Mau_14_01 spring 37.24262 744.4373 26.30119
## 4 Austria 15.965 48.2875 AT_Mau_14_02 fall 37.24262 744.4373 26.30119
## 5 Austria 15.965 48.2875 AT_Mau_15_50 spring 37.24262 744.4373 26.30119
## 6 Austria 15.965 48.2875 AT_Mau_15_51 fall 37.24262 744.4373 26.30119
## PCA of environmental variables
x <- tab(fly_gen3, NA.method = "mean")
snppca <- as.data.frame(x)
snppca$Ind <- rownames(x)
snppca$country <- fly_gen3$pop
dim(snppca)
## [1] 151 3102
gen <- snppca[order(snppca$country), ]
merging <- merge(gen, env, by = "Ind")
merging[1:5, 1:5]
## Ind 162333.A 162333.C 162335.C 162335.T
## 1 AT_gr_12_fall 1 1 1 1
## 2 AT_gr_12_spring 1 1 1 1
## 3 AT_Mau_14_01 0 2 1 1
## 4 AT_Mau_14_02 1 1 1 1
## 5 AT_Mau_15_50 1 1 1 1
rownames(merging) <- merging[ , 1]
dim(merging)
## [1] 151 3109
merging[1:5, 3101:3109]
## 163997.T country.x country.y long lat season precseason
## AT_gr_12_fall 0 Austria Austria 15.965 48.2875 fall 37.24262
## AT_gr_12_spring 0 Austria Austria 15.965 48.2875 spring 37.24262
## AT_Mau_14_01 0 Austria Austria 15.965 48.2875 spring 37.24262
## AT_Mau_14_02 1 Austria Austria 15.965 48.2875 fall 37.24262
## AT_Mau_15_50 0 Austria Austria 15.965 48.2875 spring 37.24262
## tempseason logNPP
## AT_gr_12_fall 744.4373 26.30119
## AT_gr_12_spring 744.4373 26.30119
## AT_Mau_14_01 744.4373 26.30119
## AT_Mau_14_02 744.4373 26.30119
## AT_Mau_15_50 744.4373 26.30119
merging <- merging[ , -c(1, 3102:3109)]
dim(merging)
## [1] 151 3100
merging[1:5, 3090:3100]
## 163992.G 163993.T 163993.C 163994.G 163994.A 163995.G 163995.A
## AT_gr_12_fall 0 1 1 1 1 2 0
## AT_gr_12_spring 0 1 1 1 1 2 0
## AT_Mau_14_01 1 1 1 1 1 2 0
## AT_Mau_14_02 0 1 1 1 1 2 0
## AT_Mau_15_50 1 1 1 1 1 2 0
## 163996.G 163996.C 163997.A 163997.T
## AT_gr_12_fall 1 1 2 0
## AT_gr_12_spring 1 1 2 0
## AT_Mau_14_01 1 1 2 0
## AT_Mau_14_02 1 1 1 1
## AT_Mau_15_50 1 1 2 0
genomic <- merging
env$Ind <- as.factor(env$Ind)
env$Ind <- as.character(levels(env$Ind))
dim(snppca)
## [1] 151 3102
snppca[1:5, 1:5]
## 162333.A 162333.C 162335.C 162335.T 162336.T
## AT_gr_12_fall 1 1 1 1 2
## AT_gr_12_spring 1 1 1 1 2
## AT_Mau_14_01 0 2 1 1 1
## AT_Mau_14_02 1 1 1 1 2
## AT_Mau_15_50 1 1 1 1 2
snppca[1:5, 3090:3102]
## 163992.G 163993.T 163993.C 163994.G 163994.A 163995.G 163995.A
## AT_gr_12_fall 0 1 1 1 1 2 0
## AT_gr_12_spring 0 1 1 1 1 2 0
## AT_Mau_14_01 1 1 1 1 1 2 0
## AT_Mau_14_02 0 1 1 1 1 2 0
## AT_Mau_15_50 1 1 1 1 1 2 0
## 163996.G 163996.C 163997.A 163997.T Ind country
## AT_gr_12_fall 1 1 2 0 AT_gr_12_fall Austria
## AT_gr_12_spring 1 1 2 0 AT_gr_12_spring Austria
## AT_Mau_14_01 1 1 2 0 AT_Mau_14_01 Austria
## AT_Mau_14_02 1 1 1 1 AT_Mau_14_02 Austria
## AT_Mau_15_50 1 1 2 0 AT_Mau_15_50 Austria
snppca <- snppca[ , -c(3101, 3102)]
dim(snppca)
## [1] 151 3100
snppca[1:5, 3090:3100]
## 163992.G 163993.T 163993.C 163994.G 163994.A 163995.G 163995.A
## AT_gr_12_fall 0 1 1 1 1 2 0
## AT_gr_12_spring 0 1 1 1 1 2 0
## AT_Mau_14_01 1 1 1 1 1 2 0
## AT_Mau_14_02 0 1 1 1 1 2 0
## AT_Mau_15_50 1 1 1 1 1 2 0
## 163996.G 163996.C 163997.A 163997.T
## AT_gr_12_fall 1 1 2 0
## AT_gr_12_spring 1 1 2 0
## AT_Mau_14_01 1 1 2 0
## AT_Mau_14_02 1 1 1 1
## AT_Mau_15_50 1 1 2 0
identical(rownames(genomic), env[ , 4]) ## genotypes and environmental data are in the same order
## [1] TRUE
## How many seasons per country are there?
table(data.frame(season = env$season, country = env$country))
## country
## season Austria Charlottesville Denmark Esparto Finland France Germany MI-WI
## fall 3 3 4 3 2 4 6 5
## spring 3 3 1 2 4 6 6 4
## country
## season NY-MA PA Russia Spain Tuolumne Turkey Ukraine
## fall 3 9 3 4 4 4 14
## spring 3 8 3 3 1 9 24
xtable(table(data.frame(season = env$season, country = env$country)), caption = "caption here")
## % latex table generated in R 4.2.2 by xtable 1.8-4 package
## % Tue Sep 5 00:01:23 2023
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrrrrrrrrrrrrrrr}
## \hline
## & Austria & Charlottesville & Denmark & Esparto & Finland & France & Germany & MI-WI & NY-MA & PA & Russia & Spain & Tuolumne & Turkey & Ukraine \\
## \hline
## fall & 3 & 3 & 4 & 3 & 2 & 4 & 6 & 5 & 3 & 9 & 3 & 4 & 4 & 4 & 14 \\
## spring & 3 & 3 & 1 & 2 & 4 & 6 & 6 & 4 & 3 & 8 & 3 & 3 & 1 & 9 & 24 \\
## \hline
## \end{tabular}
## \caption{caption here}
## \end{table}
names(env)
## [1] "country" "long" "lat" "Ind" "season"
## [6] "precseason" "tempseason" "logNPP"
## CCA
gen.cca <- cca(genomic ~ season*country, env)
gen.cca
## Call: cca(formula = genomic ~ season * country, data = env)
##
## Inertia Proportion Rank
## Total 0.21786 1.00000
## Constrained 0.06199 0.28453 29
## Unconstrained 0.15587 0.71547 121
## Inertia is scaled Chi-square
##
## Eigenvalues for constrained axes:
## CCA1 CCA2 CCA3 CCA4 CCA5 CCA6 CCA7 CCA8
## 0.016564 0.005943 0.003270 0.003129 0.002648 0.002531 0.002046 0.001977
## CCA9 CCA10 CCA11 CCA12 CCA13 CCA14 CCA15 CCA16
## 0.001709 0.001637 0.001583 0.001509 0.001448 0.001341 0.001248 0.001224
## CCA17 CCA18 CCA19 CCA20 CCA21 CCA22 CCA23 CCA24
## 0.001160 0.001097 0.001069 0.001030 0.000996 0.000987 0.000961 0.000899
## CCA25 CCA26 CCA27 CCA28 CCA29
## 0.000855 0.000828 0.000826 0.000780 0.000692
##
## Eigenvalues for unconstrained axes:
## CA1 CA2 CA3 CA4 CA5 CA6 CA7 CA8
## 0.014281 0.005240 0.003714 0.003412 0.003133 0.002981 0.002858 0.002616
## (Showing 8 of 121 unconstrained eigenvalues)
anova(gen.cca)
## Permutation test for cca under reduced model
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = genomic ~ season * country, data = env)
## Df ChiSquare F Pr(>F)
## Model 29 0.061989 1.6593 0.001 ***
## Residual 121 0.155874
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
env$country <- as.factor(env$country)
levels(env$country)
## [1] "Austria" "Charlottesville" "Denmark" "Esparto"
## [5] "Finland" "France" "Germany" "MI-WI"
## [9] "NY-MA" "PA" "Russia" "Spain"
## [13] "Tuolumne" "Turkey" "Ukraine"
cols <- env$country
cols <- as.numeric(cols)
ncol <- nPop(fly_gen3)
palette <- distinctColorPalette(ncol)
colors <- rep()
for(i in cols){
col <- palette[i]
colors <- c(colors, col)
}
colors
## [1] "#8967DC" "#8967DC" "#8967DC" "#8967DC" "#8967DC" "#8967DC" "#728FD5"
## [8] "#728FD5" "#728FD5" "#728FD5" "#728FD5" "#728FD5" "#83C2D9" "#83C2D9"
## [15] "#83C2D9" "#83C2D9" "#83C2D9" "#DAB08F" "#DAB08F" "#DAB08F" "#DAB08F"
## [22] "#DAB08F" "#BC3FE2" "#BC3FE2" "#BC3FE2" "#BC3FE2" "#BC3FE2" "#BC3FE2"
## [29] "#DF6863" "#DF6863" "#DF6863" "#DF6863" "#DF6863" "#DF6863" "#DF6863"
## [36] "#DF6863" "#DF6863" "#DF6863" "#85967A" "#85967A" "#85967A" "#85967A"
## [43] "#85967A" "#85967A" "#85967A" "#85967A" "#85967A" "#85967A" "#85967A"
## [50] "#85967A" "#B1E750" "#B1E750" "#B1E750" "#B1E750" "#B1E750" "#B1E750"
## [57] "#B1E750" "#B1E750" "#B1E750" "#6EE189" "#6EE189" "#6EE189" "#6EE189"
## [64] "#6EE189" "#6EE189" "#D3E59C" "#D3E59C" "#D3E59C" "#D3E59C" "#D3E59C"
## [71] "#D3E59C" "#D3E59C" "#D3E59C" "#D3E59C" "#D3E59C" "#D3E59C" "#D3E59C"
## [78] "#D3E59C" "#D3E59C" "#D3E59C" "#D3E59C" "#D3E59C" "#E2BC4D" "#E2BC4D"
## [85] "#E2BC4D" "#E2BC4D" "#E2BC4D" "#E2BC4D" "#D8DFD5" "#D8DFD5" "#D8DFD5"
## [92] "#D8DFD5" "#D8DFD5" "#D8DFD5" "#D8DFD5" "#D7ABD2" "#D7ABD2" "#D7ABD2"
## [99] "#D7ABD2" "#D7ABD2" "#7BE6D2" "#7BE6D2" "#7BE6D2" "#7BE6D2" "#7BE6D2"
## [106] "#7BE6D2" "#7BE6D2" "#7BE6D2" "#7BE6D2" "#7BE6D2" "#7BE6D2" "#7BE6D2"
## [113] "#7BE6D2" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0"
## [120] "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0"
## [127] "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0"
## [134] "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0"
## [141] "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0"
## [148] "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0"
## png("cca_revision.png", width = 7, height = 7, units = "in", res = 300)
## pdf("cca_revision.pdf")
plot(gen.cca, display = "sites", type = "n", las = 1)
with(gen.cca, points(gen.cca, display = "sites", pch = 21, bg = colors, col = colors))
##with(env[3], ordiellipse(gen.cca, env$country, kind = "se", conf = 0.95, col = "gray"))
##with(env[3], ordihull(gen.cca, env$country, col = colors, lty = 2))
with(env[3], ordispider(gen.cca, env$country, col = unique(colors), label = TRUE))
##legend("topright", legend = levels(env$country), pch = 19, col = unique(cols), bty = "n", cex = 0.8)
## dev.off()
Figure 4. Constrained correspondence analysis depicting the difference in allele frequency across populations mediated by the interaction between season and locality. Filled dots represent pooled samples and “spider” diagrams connect the pools to the population centroid.
## Detecting selection
load("/Users/dpadil10/Dropbox (ASU)/foraging_mode_Drosophila/data_analysis/chr2-scan/fly.pv1.RData")
load("/Users/dpadil10/Dropbox (ASU)/foraging_mode_Drosophila/data_analysis/chr2-scan/fly.qv1.RData")
fly.pv1.chr2 <- fly.pv1
fly.qv1.chr2 <- fly.qv1
rm(fly.pv1)
rm(fly.qv1)
head(env)
## country long lat Ind season precseason tempseason logNPP
## 1 Austria 15.965 48.2875 AT_gr_12_fall fall 37.24262 744.4373 26.30119
## 2 Austria 15.965 48.2875 AT_gr_12_spring spring 37.24262 744.4373 26.30119
## 3 Austria 15.965 48.2875 AT_Mau_14_01 spring 37.24262 744.4373 26.30119
## 4 Austria 15.965 48.2875 AT_Mau_14_02 fall 37.24262 744.4373 26.30119
## 5 Austria 15.965 48.2875 AT_Mau_15_50 spring 37.24262 744.4373 26.30119
## 6 Austria 15.965 48.2875 AT_Mau_15_51 fall 37.24262 744.4373 26.30119
##env.pca <- rda(env[ , c(2, 3, 6, 7, 8)], scale = T)
env.pca <- rda(env[ , c(6, 7, 8)], scale = T)
summary(env.pca)$cont
## $importance
## Importance of components:
## PC1 PC2 PC3
## Eigenvalue 1.2227 0.9646 0.8127
## Proportion Explained 0.4076 0.3215 0.2709
## Cumulative Proportion 0.4076 0.7291 1.0000
par(las = 1)
screeplot(env.pca, bstick = TRUE, type = "lines", las = 1, main = "")
round(scores(env.pca, choices = 1:3, display = "species", scaling = 0), digits = 3)
## PC1 PC2 PC3
## precseason -0.655 0.273 -0.704
## tempseason 0.658 -0.251 -0.710
## logNPP 0.371 0.929 0.016
## attr(,"const")
## [1] 4.605779
pred.PC2 <- scores(env.pca, choices = 2, display = "sites", scaling = 0)
K <- 9
fly.lfmm1 <- lfmm_ridge(Y = snppca, X = pred.PC2, K = K) ## change K as you see fit
fly.pv1 <- lfmm_test(Y = snppca, X = pred.PC2, lfmm = fly.lfmm1, calibrate = "gif")
names(fly.pv1) # this object includes raw z-scores and p-values, as well as GIF-calibrated scores and p-values
## [1] "B" "epsilon.sigma2" "B.sigma2"
## [4] "score" "pvalue" "gif"
## [7] "calibrated.score2" "calibrated.pvalue"
## let’s look at the genomic inflation factor (GIF)
fly.pv1$gif
## PC2
## 1.359424
fly.pv1.chr2$gif
## PC2
## 1.239521
## an appropriately calibrated set of tests will have a GIF of around 1
## let’s look at how application of the GIF to the p-values impacts the p-value distribution
quartz()
layout(matrix(c(1, 1, 2, 2,
1, 1, 2, 2,
3, 3, 4, 4,
3, 3, 4, 4), nrow = 4, ncol = 4, byrow = FALSE))
par(mar = c(5, 5, 1, 1), las = 1)
hist(fly.pv1$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "for", ylim = c(0, 350))
legend("top", "K = 9", bty = "n")
hist(fly.pv1$pvalue[ , 1], xlab = "Unadjusted p-values", las = 1, main = "", ylim = c(0, 400))
legend("top", "K = 9", bty = "n")
hist(fly.pv1.chr2$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "Chr 2L", ylim = c(0, 150000))
legend("top", "K = 8", bty = "n")
hist(fly.pv1.chr2$pvalue[ , 1], xlab = "Unadjusted p-values", las = 1, main = "", ylim = c(0, 150000))
legend("top", "K = 8", bty = "n")
## convert the adjusted p-values to q-values. q-values provide a measure of each SNP’s significance, automatically taking into account the fact that thousands are simultaneously being tested. I can then use an FDR threshold to control the number of false positive detections (given that the p-value distribution is “well-behaved”)
fly.qv1 <- qvalue(fly.pv1$calibrated.pvalue)$qvalues
length(which(fly.qv1 < 0.05)) ## how many SNPs have an FDR < 5%?
## [1] 4
fly.qv1.chr2 <- qvalue(fly.pv1.chr2$calibrated.pvalue)$qvalues
length(which(fly.qv1.chr2 < 0.05)) ## how many SNPs have an FDR < 5%?
## [1] 142
## using K = 9, the default GIF correction, and an FDR threshold of 0.05, I detected 2 candidate SNPs under selection in response to the PC2 environmental predictor
(fly.FDR <- colnames(snppca)[which(fly.qv1 < 0.05)]) ## identify which SNPs these are
## [1] "162922.C" "162922.A" "163269.T" "163269.C"
dim(snppca)
## [1] 151 3100
rm(snppca)
load("/Users/dpadil10/Dropbox (ASU)/foraging_mode_Drosophila/data_analysis/chr2-scan/snppca_chr2.RData")
dim(snppca)
## [1] 151 1862322
(fly.FDR.chr2 <- colnames(snppca)[which(fly.qv1.chr2 < 0.05)]) ## identify which SNPs these are
## [1] "3205.G" "3205.A" "13746.A" "13746.G" "20886.T" "20886.A"
## [7] "27909.C" "27909.G" "38816.C" "38816.T" "42047.G" "42047.A"
## [13] "42049.G" "42049.A" "44237.C" "44237.A" "62254.G" "62254.T"
## [19] "77605.T" "77605.C" "84995.C" "84995.A" "87409.C" "87409.A"
## [25] "95522.T" "95522.C" "105438.T" "105438.G" "106073.G" "106073.C"
## [31] "110380.A" "110380.T" "120960.A" "120960.T" "133844.T" "133844.C"
## [37] "163269.T" "163269.C" "169348.G" "169348.C" "170587.G" "170587.A"
## [43] "178736.T" "178736.C" "211689.G" "211689.T" "224757.T" "224757.G"
## [49] "249489.G" "249489.A" "271578.A" "271578.C" "271957.G" "271957.A"
## [55] "293046.G" "293046.A" "295197.T" "295197.C" "295212.C" "295212.T"
## [61] "309911.C" "309911.A" "311205.G" "311205.A" "337949.C" "337949.T"
## [67] "347923.A" "347923.G" "348847.C" "348847.T" "377565.A" "377565.G"
## [73] "387711.G" "387711.A" "388835.C" "388835.T" "415369.G" "415369.T"
## [79] "420886.C" "420886.T" "425629.G" "425629.A" "425633.G" "425633.C"
## [85] "457451.C" "457451.A" "462795.G" "462795.C" "504846.T" "504846.C"
## [91] "529356.T" "529356.G" "529357.G" "529357.C" "533630.C" "533630.T"
## [97] "568526.C" "568526.G" "590620.G" "590620.A" "639512.A" "639512.G"
## [103] "639937.T" "639937.C" "647272.A" "647272.T" "657788.C" "657788.A"
## [109] "699680.C" "699680.A" "703874.A" "703874.T" "711720.C" "711720.A"
## [115] "725241.C" "725241.A" "745747.C" "745747.T" "839278.C" "839278.G"
## [121] "858195.A" "858195.C" "858669.T" "858669.C" "895244.G" "895244.A"
## [127] "899332.G" "899332.A" "900589.T" "900589.C" "913028.T" "913028.A"
## [133] "939029.G" "939029.A" "959209.G" "959209.C" "961471.T" "961471.A"
## [139] "962082.A" "962082.T" "973543.G" "973543.C"
## visualizing results
## Manhattan plot with causal loci
qvalues <- fly.qv1
qvalues.chr2 <- fly.qv1.chr2
row <- as.data.frame(qvalues)
str(row)
## 'data.frame': 3100 obs. of 1 variable:
## $ PC2: num 0.98 0.98 0.954 0.954 0.98 ...
rownames(row) <- 1:nrow(row)
row.chr2 <- as.data.frame(qvalues.chr2)
str(row.chr2)
## 'data.frame': 1862322 obs. of 1 variable:
## $ PC2: num 0.995 0.995 0.995 0.995 0.995 ...
rownames(row.chr2) <- 1:nrow(row.chr2)
(causal.set <- as.numeric(rownames(row)[which(row < 0.05)]))
## [1] 1103 1104 1731 1732
(fly.FDR <- gsub("[^0-9]", "", fly.FDR))
## [1] "162922" "162922" "163269" "163269"
head(afis[afis$variant.id == fly.FDR, c(2, 6, 7, 8, 9, 11, 21, 32)], n = 2) # One of the SNPs should be checked carefully as the position on chromosome 2L does not match the annotations. SNP163269 is at pos 3637803, which falls within the for gene. According to FlyBase, the gene mir-4972 stem loop is referred to Dmel\mir-4972 (CR43508, FBgn0263523). It is a miRNA_gene from Dmel. It has 3 annotated transcripts. Gene sequence location is 2L:3636967..3637073. Its molecular function is unknown. The biological processes in which it is involved are not known. No alleles are reported. [https://flybase.org/reports/FBgn0263523](https://flybase.org/reports/FBgn0263523)
## variant.id col gene chr pos genotype type
## 1: 162922 upstream_gene_variant for 2L 3632475 C,A pooled
## 2: 163269 upstream_gene_variant mir-4972 2L 3637803 T,C pooled
## stationId
## 1: AUM00011389
## 2: AUM00011389
(causal.set.chr2 <- as.numeric(rownames(row.chr2)[which(row.chr2 < 0.05)]))
## [1] 6167 6168 25903 25904 39399 39400 52115 52116 71955
## [10] 71956 77599 77600 77603 77604 81655 81656 115531 115532
## [19] 143587 143588 157179 157180 161669 161670 176903 176904 195495
## [28] 195496 196697 196698 204731 204732 224215 224216 247579 247580
## [37] 301775 301776 313105 313106 315465 315466 330607 330608 391487
## [46] 391488 414733 414734 459215 459216 500307 500308 501029 501030
## [55] 539587 539588 543467 543468 543495 543496 570615 570616 573025
## [64] 573026 622327 622328 640403 640404 642081 642082 695723 695724
## [73] 714473 714474 716593 716594 766177 766178 776383 776384 785203
## [82] 785204 785209 785210 844605 844606 854627 854628 932785 932786
## [91] 978685 978686 978687 978688 986719 986720 1051787 1051788 1093387
## [100] 1093388 1184187 1184188 1184971 1184972 1198467 1198468 1217981 1217982
## [109] 1296481 1296482 1304355 1304356 1318903 1318904 1344395 1344396 1383145
## [118] 1383146 1559575 1559576 1595465 1595466 1596355 1596356 1666265 1666266
## [127] 1674137 1674138 1676511 1676512 1699993 1699994 1748885 1748886 1787129
## [136] 1787130 1791379 1791380 1792525 1792526 1814505 1814506
(fly.FDR.chr2 <- gsub("[^0-9]", "", fly.FDR.chr2))
## [1] "3205" "3205" "13746" "13746" "20886" "20886" "27909" "27909"
## [9] "38816" "38816" "42047" "42047" "42049" "42049" "44237" "44237"
## [17] "62254" "62254" "77605" "77605" "84995" "84995" "87409" "87409"
## [25] "95522" "95522" "105438" "105438" "106073" "106073" "110380" "110380"
## [33] "120960" "120960" "133844" "133844" "163269" "163269" "169348" "169348"
## [41] "170587" "170587" "178736" "178736" "211689" "211689" "224757" "224757"
## [49] "249489" "249489" "271578" "271578" "271957" "271957" "293046" "293046"
## [57] "295197" "295197" "295212" "295212" "309911" "309911" "311205" "311205"
## [65] "337949" "337949" "347923" "347923" "348847" "348847" "377565" "377565"
## [73] "387711" "387711" "388835" "388835" "415369" "415369" "420886" "420886"
## [81] "425629" "425629" "425633" "425633" "457451" "457451" "462795" "462795"
## [89] "504846" "504846" "529356" "529356" "529357" "529357" "533630" "533630"
## [97] "568526" "568526" "590620" "590620" "639512" "639512" "639937" "639937"
## [105] "647272" "647272" "657788" "657788" "699680" "699680" "703874" "703874"
## [113] "711720" "711720" "725241" "725241" "745747" "745747" "839278" "839278"
## [121] "858195" "858195" "858669" "858669" "895244" "895244" "899332" "899332"
## [129] "900589" "900589" "913028" "913028" "939029" "939029" "959209" "959209"
## [137] "961471" "961471" "962082" "962082" "973543" "973543"
head(afis[afis$variant.id == fly.FDR.chr2, c(2, 6, 7, 8, 9, 11, 21, 32)], n = 2)
## variant.id col gene chr pos genotype type
## 1: 163269 upstream_gene_variant mir-4972 2L 3637803 T,C inbred_line
## 2: 163269 upstream_gene_variant mir-4972 2L 3637803 T,C pooled
## stationId
## 1: CHM00054511
## 2: USW00004780
data.frame(causal.set.chr2, fly.FDR.chr2)
## causal.set.chr2 fly.FDR.chr2
## 1 6167 3205
## 2 6168 3205
## 3 25903 13746
## 4 25904 13746
## 5 39399 20886
## 6 39400 20886
## 7 52115 27909
## 8 52116 27909
## 9 71955 38816
## 10 71956 38816
## 11 77599 42047
## 12 77600 42047
## 13 77603 42049
## 14 77604 42049
## 15 81655 44237
## 16 81656 44237
## 17 115531 62254
## 18 115532 62254
## 19 143587 77605
## 20 143588 77605
## 21 157179 84995
## 22 157180 84995
## 23 161669 87409
## 24 161670 87409
## 25 176903 95522
## 26 176904 95522
## 27 195495 105438
## 28 195496 105438
## 29 196697 106073
## 30 196698 106073
## 31 204731 110380
## 32 204732 110380
## 33 224215 120960
## 34 224216 120960
## 35 247579 133844
## 36 247580 133844
## 37 301775 163269
## 38 301776 163269
## 39 313105 169348
## 40 313106 169348
## 41 315465 170587
## 42 315466 170587
## 43 330607 178736
## 44 330608 178736
## 45 391487 211689
## 46 391488 211689
## 47 414733 224757
## 48 414734 224757
## 49 459215 249489
## 50 459216 249489
## 51 500307 271578
## 52 500308 271578
## 53 501029 271957
## 54 501030 271957
## 55 539587 293046
## 56 539588 293046
## 57 543467 295197
## 58 543468 295197
## 59 543495 295212
## 60 543496 295212
## 61 570615 309911
## 62 570616 309911
## 63 573025 311205
## 64 573026 311205
## 65 622327 337949
## 66 622328 337949
## 67 640403 347923
## 68 640404 347923
## 69 642081 348847
## 70 642082 348847
## 71 695723 377565
## 72 695724 377565
## 73 714473 387711
## 74 714474 387711
## 75 716593 388835
## 76 716594 388835
## 77 766177 415369
## 78 766178 415369
## 79 776383 420886
## 80 776384 420886
## 81 785203 425629
## 82 785204 425629
## 83 785209 425633
## 84 785210 425633
## 85 844605 457451
## 86 844606 457451
## 87 854627 462795
## 88 854628 462795
## 89 932785 504846
## 90 932786 504846
## 91 978685 529356
## 92 978686 529356
## 93 978687 529357
## 94 978688 529357
## 95 986719 533630
## 96 986720 533630
## 97 1051787 568526
## 98 1051788 568526
## 99 1093387 590620
## 100 1093388 590620
## 101 1184187 639512
## 102 1184188 639512
## 103 1184971 639937
## 104 1184972 639937
## 105 1198467 647272
## 106 1198468 647272
## 107 1217981 657788
## 108 1217982 657788
## 109 1296481 699680
## 110 1296482 699680
## 111 1304355 703874
## 112 1304356 703874
## 113 1318903 711720
## 114 1318904 711720
## 115 1344395 725241
## 116 1344396 725241
## 117 1383145 745747
## 118 1383146 745747
## 119 1559575 839278
## 120 1559576 839278
## 121 1595465 858195
## 122 1595466 858195
## 123 1596355 858669
## 124 1596356 858669
## 125 1666265 895244
## 126 1666266 895244
## 127 1674137 899332
## 128 1674138 899332
## 129 1676511 900589
## 130 1676512 900589
## 131 1699993 913028
## 132 1699994 913028
## 133 1748885 939029
## 134 1748886 939029
## 135 1787129 959209
## 136 1787130 959209
## 137 1791379 961471
## 138 1791380 961471
## 139 1792525 962082
## 140 1792526 962082
## 141 1814505 973543
## 142 1814506 973543
#snps.sel.chr2 <- unique(fly.FDR.chr2)
#save(snps.sel.chr2, file = "snps.sel.chr2")
quartz()
layout(matrix(c(0, 1, 1, 0,
0, 1, 1, 0,
0, 2, 2, 0,
0, 2, 2, 0), nrow = 4, ncol = 4, byrow = TRUE))
plot(-log10(qvalues), pch = 19, cex = 1, bg = "black", col = alpha("gray", 0.3), xlab = "SNP", las = 1, ylim = c(0, 2), main = "for")
points(causal.set, -log10(qvalues)[causal.set], pch = 19, col = alpha("red", 0.3))
text(causal.set, (-log10(qvalues)[causal.set] - 0.1), fly.FDR, col = "red")
plot(-log10(qvalues.chr2), pch = 19, cex = 1, bg = "black", col = alpha("gray", 0.3), xlab = "SNP", las = 1, ylim = c(0, 3), main = "Chr 2L")
points(causal.set.chr2[c(37, 38)], -log10(qvalues.chr2)[causal.set.chr2[c(37, 38)]], pch = 19, col = alpha("red", 0.3))
text(causal.set.chr2[c(37, 38)], (-log10(qvalues.chr2)[causal.set.chr2[c(37, 38)]] - 0.1), fly.FDR.chr2[c(37, 38)], col = "red", cex = 0.5)
quartz()
## png("adaptation_revision.png", width = 7, height = 7, units = "in", res = 300)
## pdf("adaptation_revision.pdf")
layout(matrix(c(1, 1, 2, 2,
1, 1, 2, 2,
3, 3, 4, 4,
3, 3, 4, 4), nrow = 4, ncol = 4, byrow = FALSE))
hist(fly.pv1$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "for", ylim = c(0, 350))
legend("top", paste("K = 9\nGIF = ", round(fly.pv1$gif, 3)), bty = "n")
mtext("(a)", side = 2, at = 365, line = 2.5, las = 1, font = 2, family = "serif")
plot(-log10(qvalues), pch = 19, cex = 1, bg = "gray", col = alpha("gray", 0.6), xlab = "SNP", las = 1, ylim = c(0, 2))
points(causal.set, -log10(qvalues)[causal.set], pch = 19, col = "black")
text(causal.set, (-log10(qvalues)[causal.set] - 0.15), fly.FDR, col = "black")
hist(fly.pv1.chr2$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "Chr 2L", ylim = c(0, 150000))
legend("top", paste("K = 8\nGIF = ", round(fly.pv1.chr2$gif, 3)), bty = "n")
mtext("(b)", side = 2, at = 158010, line = 4, las = 1, font = 2, family = "serif")
plot(-log10(qvalues.chr2), pch = 19, cex = 1, bg = "black", col = alpha("gray", 0.3), xlab = "SNP", las = 1, ylim = c(0, 3), main = "")
points(causal.set.chr2[-c(37, 38)], -log10(qvalues.chr2)[causal.set.chr2[-c(37, 38)]], pch = 19, col = "black")
points(causal.set.chr2[c(37, 38)], -log10(qvalues.chr2)[causal.set.chr2[c(37, 38)]], pch = 19, col = "red")
text(causal.set.chr2[c(37, 38)], (-log10(qvalues.chr2)[causal.set.chr2[c(37, 38)]] - 0.13), fly.FDR.chr2[c(37, 38)], col = "red", cex = 0.8)
## dev.off()
Figure 5. Genotype-environment association test based on a latent factor mixed model with ridge penalty. (a) Distribution of adjusted p-values using the default genomic inflation factor, and the for loci (SNPs) potentially affected by the PC2 predictor variable. Likewise, (b) displays the same analysis for the entire chromosome 2L. Bold dots represent loci considered to be under selection according to a False Discovery Rate of 0.05.